library(Matrix)
library(cluster)
library(lattice)
library(plotrix)
library(gridExtra)
library(KernSmooth)
library(boot)
library(parallel)
library(texreg)
library(nnet)
library(xtable)
library(lpSolve)
library(knitr)

.default_par <- par(no.readonly=TRUE)

BASEDIR <- sprintf('%s/github/post-mortem-memory', Sys.getenv('HOME'))
DATADIR <- sprintf('%s/data/', BASEDIR)
PLOTDIR <- sprintf('%s/plots/', BASEDIR)
TABLEDIR <- sprintf('%s/tables/', BASEDIR)

fignum <- 0
tabnum <- 0
figcap <- function(increment) {
  if (increment) fignum <<- fignum + 1
  sprintf('**%sFigure %s:** ', if (SUPP_INFO) 'Supplementary ' else '', fignum)
}
tabcap <- function(increment) {
  if (increment) tabnum <<- tabnum + 1
  sprintf('**%sTable %s:** ', if (SUPP_INFO) 'Supplementary ' else '', tabnum)
}

vertskip <- '$$\\\\[4mm]$$'

# We aggregate chunk_size days; a value of 7 means we aggregate by weeks.
CHUNK_SIZE <- 1

# The number of time units ('chunks') before and after death.
N <- 360

COL <- list(NEWS=rgb(.9,.6,0), TWITTER=rgb(0,.45,.7))
COL_LIGHT <- list(NEWS=rgb(.9,.6,0,.3), TWITTER=rgb(0,.45,.7,.3))
COL_XLIGHT <- list(NEWS=rgb(.9,.6,0,.03), TWITTER=rgb(0,.45,.7,.03))
COL_LIGHTBLUE <- "#56B4E9"
COL_YELLOW <- "#F0E442"
COL_DARKBLUE <- "#0072B2"
COL_RED <- "#D55E00"
COL_MAGENTA <- "#CC79A7"
COL_GRAY <- "#999999"
COL_ORANGE <- "#E69F00"
COL_GREEN <- "#009E73"

ANGLO_COUNTRY_REGEX <- 'United States|United Kingdom|Canada|England|Australia|Scotland|Wales|Ireland|New Zealand|South Africa'

type_groups <- c('art', 'sports', 'leadership', 'known for death', 'general fame', 'academia/engineering')

split_at <- function(str, sep) strsplit(str, sep)[[1]]

fancy_mid <- function(mid) sub('<http://rdf.freebase.com/ns/m.(.*)>', '/m/\\1', mid)
long_mid <- function(mid) sub('/m/(.*)', '<http://rdf.freebase.com/ns/m.\\1>', mid)

fancy_wiki <- function(wiki) sub('<http://en.wikipedia.org/wiki/(.*)>', '\\1', wiki)
long_wiki <- function(wiki) sprintf('<http://en.wikipedia.org/wiki/%s>', wiki)

FANCY_CURVE_CHAR_NAMES <- c('Pre-mortem mean', 'Short-term boost', 'Long-term boost', 'Halving time')
names(FANCY_CURVE_CHAR_NAMES) <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')

# A list of all days.
MIN_DATE <- as.Date("2008-08-01")
MAX_DATE <- as.Date("2014-09-30")
DAYS <- as.character(seq(MIN_DATE, MAX_DATE, by="+1 day"))
MAX_ABS_RELDATE <- as.numeric(MAX_DATE - MIN_DATE)

# Our Twitter data set is reasonably large only from 2009-06-11 on.
TWITTER_START_DATE <- as.Date("2009-06-11")

# We build a matrix x that has a row per person, and a column per day since death. There are
# nreldates = 2*(MAX_DATE-MIN_DATE)+1 such possible dates: the extreme cases are someone dying on
# MIN_DATE or MAX_DATE, so the relDates are [0 : MAX_DATE-MIN_DATE] in the former case, and
# [MIN_DATE-MAX_DATE : 0] in the latter case, for a total range of [MIN_DATE-MAX_DATE :
# MAX_DATE-MIN_DATE]; there are 2*(MAX_DATE-MIN_DATE)+1 values in this range.
# Values of relDate can be negative, zero, or positive. However, since R indices must be
# positive, we must translate relDates to get indices; the translation is as such:
# idx = relDate + MAX_DATE - MIN_DATE + 1
NRELDATES <- 2 * as.numeric(MAX_DATE - MIN_DATE) + 1

# The days on which our Spinn3r client must have been broken (there are still several tens of
# thousands of articles with dates from these days, but they were crawled on days outside of this
# window).
EMPTY_DAYS <- c(
  "2009-05-19", "2009-07-15", "2010-04-05", "2010-04-20", "2010-04-21", "2010-04-22", "2010-04-23", "2010-04-24",
  "2010-04-25", "2010-04-26", "2010-04-27", "2010-04-28", "2010-04-29", "2010-04-30", "2010-05-01", "2010-05-02", 
  "2010-05-03", "2010-05-04", "2010-05-05", "2010-05-06", "2010-05-07", "2010-05-08", "2010-05-09", "2010-05-10",
  "2010-05-11", "2010-05-12", "2010-05-13", "2010-05-16", "2010-05-17", "2010-05-18", "2010-05-19", "2010-05-20",
  "2010-05-21", "2010-05-22", "2010-05-23", "2010-05-24", "2010-05-25", "2010-05-26", "2010-05-27", "2010-05-28",
  "2010-05-29", "2010-05-30", "2010-05-31", "2010-06-01", "2010-06-02", "2010-06-03", "2010-06-04", "2010-06-05",
  "2010-06-06", "2010-06-07", "2010-06-08", "2010-06-09", "2010-06-10", "2010-06-11", "2010-06-12", "2010-06-13",
  "2010-06-14", "2010-06-15", "2010-06-16", "2010-06-17", "2010-06-18", "2010-06-19", "2010-06-20", "2010-06-21",
  "2010-06-22", "2010-06-23", "2010-06-24", "2010-06-25", "2010-06-26", "2010-06-27", "2010-06-28", "2010-06-29",
  "2010-06-30", "2010-07-01", "2010-07-02", "2010-07-03", "2010-07-04", "2010-07-05", "2010-07-06", "2010-07-07",
  "2010-07-08", "2010-07-09", "2010-07-10", "2010-07-11", "2010-07-12", "2010-07-13", "2011-12-03")

# The dates of death.
death_dates_tbl <- read.table(pipe(sprintf('gunzip -c %s/date_of_death.nt', DATADIR)),
                              comment.char='', sep='\t', quote='',
                              col.names=c('mid', '_rel', 'date', '_period'))[,c('mid', 'date')]
death_dates_tbl <- death_dates_tbl[grepl('#date>$', death_dates_tbl$date),]
death_dates_tbl$date <- substring(death_dates_tbl$date, 2, 11)
death_dates <- death_dates_tbl$date
names(death_dates) <- death_dates_tbl$mid

# Our Twitter data set is reasonably large only from 2009-06-11 on.
# Get the mids of the people that died between then and MAX_DATE (Sept. 2014).
died_in_window <- names(death_dates[as.Date(death_dates) >= TWITTER_START_DATE
                                    & as.Date(death_dates) <= MAX_DATE])

# A mapping from mids to Wikipedia names.
wiki_tbl <- read.table(pipe(sprintf('gunzip -c %s/names.DEAD.UNAMBIGUOUS.tsv.gz', DATADIR)),
                       comment.char='', sep='\t', quote='',
                       col.names=c('mid', 'names', 'aliases', 'curid', 'wiki'),
                       stringsAsFactors=FALSE)
wiki_tbl$wiki <- sapply(wiki_tbl$wiki, function(x) split_at(x, '\\|')[1])
mid_to_wiki <- wiki_tbl$wiki
names(mid_to_wiki) <- wiki_tbl$mid
mid_to_wiki <- mid_to_wiki[!is.na(mid_to_wiki)]

# A mapping from mids to names.
wiki_to_mid <- names(mid_to_wiki)
names(wiki_to_mid) <- mid_to_wiki

# Properties of dead people.
props <- read.table(pipe(sprintf('gunzip -c %s/dead_people_properties.tsv.gz', DATADIR)),
                    comment.char='', sep='\t', quote='', header=TRUE, fill=TRUE,
                    col.names=c('person', 'cause_of_death', 'date_of_death', 'place_of_death', 'date_of_burial',
                                'place_of_burial', 'date_of_cremation', 'place_of_cremation', 'date_of_birth',
                                'place_of_birth', 'nationality', 'profession', 'religion', 'ethnicity',
                                'notable_types', 'gender'))
props$mid <- sub('(<.*>).*', '\\1', props$person)
props$gender <- as.factor(sub('<.*>(.*)', '\\1', props$gender))
death_years <- sub('"(....).*', '\\1', props$date_of_death)
birth_years <- sub('"(....).*', '\\1', props$date_of_birth)
death_years[!grepl("....", death_years)] <- NA
birth_years[!grepl("....", birth_years)] <- NA
props$age <- as.numeric(death_years) - as.numeric(birth_years)
rownames(props) <- props$mid

# Load the taxonomies.
tax_causes <- read.table(sprintf('%s/taxonomy_causes_of_death.tsv', DATADIR),
                         header=TRUE, sep='\t', comment.char='', quote='')
rownames(tax_causes) <- paste(tax_causes$mid, tax_causes$cause_of_death, sep='')
tax_types <- read.table(sprintf('%s/taxonomy_notable_types.tsv', DATADIR),
                        header=TRUE, sep='\t', comment.char='', quote='')
rownames(tax_types) <- paste(tax_types$mid, tax_types$notable_type, sep='')

# Some causes of death are missing from Janice's table. Add them manually.
natural_manual <- "Viral pneumonia|Smallpox|Dementia with Lewy bodies|Heart valve disease|Creutzfeldt–Jakob disease|T-Cell Lymphoma|Adrenocortical carcinoma|Huntington's disease|Congenital heart defect|Squamous-cell carcinoma|Atypical teratoid rhabdoid tumor|Alveolar rhabdomyosarcoma|Appendix cancer|Pyelonephritis|Polymyalgia rheumatica|Polycythemia|Leiomyosarcoma|Astrocytoma"
unnatural_manual <- "Smoke inhalation|Racing Accident|Lightning|Casualty of war|Cocaine overdose|Poisoning|Shootout|Murder–suicide|Accidental drug overdose|Blast injury"

# Create the regexes for (un)natural deaths.
natural_regex <- sprintf(">(%s|%s)($|\\|)",
                         paste(tax_causes$cause.of.death[tax_causes$level1=='natural'],
                               collapse='|'), natural_manual)
unnatural_regex <- sprintf(">(%s|%s)($|\\|)",
                           paste(tax_causes$cause.of.death[tax_causes$level1=='unnatural'],
                                 collapse='|'), unnatural_manual)

# -1 = unnatural, 0 = unknown/conflicting, 1 = natural.
get_cause_of_death_map <- function() {
  natural_death_mids <- props$mid[which(grepl(natural_regex, props$cause_of_death))]
  unnatural_death_mids <- props$mid[which(grepl(unnatural_regex, props$cause_of_death))]
  map <- (props$mid %in% natural_death_mids) - (props$mid %in% unnatural_death_mids)
  # If a person has a natural and an unnatural cause, we want to list them under unnatural death.
  map[props$mid %in% natural_death_mids & props$mid %in% unnatural_death_mids] <- -1
  names(map) <- props$mid
  return(map)
}

# e.g., <http://rdf.freebase.com/ns/m.025698c>Baseball Player --> <http://rdf.freebase.com/ns/m.025698c>
strip_plaintext_name_from_mid <- function(mid_with_name) {
  sub('(<.*>).*', '\\1', mid_with_name)
}

# tax must be either tax_types or tax_causes;
# data must be a vector with taxonomy entries as values and mids as names.
select_from_tax <- function(level, value, tax, data) {
  col <- sprintf('level%d', level)
  ok_values <- rownames(tax)[tax[,col] == value]
  names(data)[data %in% ok_values]
}

get_num_art <- function(medium) {
  if (medium != 'NEWS' && medium != 'TWITTER') {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }
  num_art_tbl <- read.table(sprintf('%s/num_articles_per_day_%s.tsv', DATADIR, medium),
                            col.names=c('date', 'num'))
  num_art <- num_art_tbl$num
  names(num_art) <- num_art_tbl$date
  num_art <- num_art[names(num_art) >= as.character(MIN_DATE) & names(num_art) <= as.character(MAX_DATE)]
  num_art[setdiff(DAYS, names(num_art))] <- 0
  num_art <- num_art[order(names(num_art))]
  num_art[EMPTY_DAYS] <- NA
  return(num_art)
}

get_mention_freq_table <- function(medium) {
  if (medium != 'NEWS' && medium != 'TWITTER') {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }
  file <- sprintf('%s/RData/dead_people_mentions_%s.RData', DATADIR, medium)
  if (!file.exists(file)) {
    data <- read.table(pipe(sprintf('gunzip -c %s/num_dead_mentions_per_day_%s.tsv.gz', DATADIR, medium)),
                       comment.char='', sep='\t', quote='',
                       col.names=c('mid', 'date', 'num_per_doc', 'num_doc'),
                       colClasses=c('character', 'character', 'numeric', 'numeric'))
    # Add a column having the number of days since death.
    data$rel_date <- as.numeric(as.Date(data$date) - as.Date(death_dates[data$mid]))
    save(data, file=file)
  } else {
    load(file)
  }
  return(data)
}

get_rel_date_matrix <- function(medium, data, num_art, chunk_size) {
  if (medium == 'NEWS') {
    min_num_per_doc <- 2
  } else if (medium == 'TWITTER') {
    min_num_per_doc <- 1
  } else {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }
  chunks <- floor(((1:NRELDATES)-(NRELDATES+1)/2)/chunk_size)
  sum.na.rm <- function(x) sum(x, na.rm=TRUE)
  idx <- which(data$num_per_doc >= min_num_per_doc)
  file <- sprintf('%s/RData/num_mentions_per_rel_date_%s_min_num_per_doc=%s_chunk_size=%s.RData',
                  DATADIR, medium, min_num_per_doc, chunk_size)
  if (!file.exists(file)) {
    x <- do.call(rbind, mclapply(split(data[idx,], data$mid[idx]), function(l) {
      dod <- as.Date(death_dates[l$mid[1]])
      # Aggregate the counts of docs containing 1, 2, >=3 mentions.
      l <- data.frame(t(simplify2array(by(l, l$date, function(X) c(
        as.numeric(as.character(X$rel_date[1])),
        as.numeric(as.character(sum(X$num_doc))))))))
      colnames(l) <- c('rel_date', 'num_doc')
      l$date <- rownames(l)
      # Initialize y, the count vector for the current mid.
      # y has 2*(MAX_DATE-MIN_DATE)+1 entries, of which only MAX_DATE-MIN_DATE+1 are well-defined (as per
      # our discussion of NRELDATES above).
      # For the MAX_DATE-MIN_DATE days for which we have no data, the values are set to NA.
      # We also set to NA the values for the days that have no Spinn3r data (primarily the 2010 hole).
      # For the remaining MAX_DATE-MIN_DATE+1 days, we initialize values to 0.
      offset <- as.numeric(MAX_DATE-MIN_DATE) + 1
      y <- rep(NA, NRELDATES)
      y[as.numeric(as.Date(DAYS)-dod) + offset] <- 0
      y[l$rel_date + offset] <- l$num_doc
      y[as.numeric(as.Date(EMPTY_DAYS)-dod) + offset] <- NA
      # n: the number of docs per day aligned in terms of relative dates.
      n <- rep(0, NRELDATES)
      n[as.numeric(as.Date(DAYS)-dod) + offset] <- num_art
      # Sum within chunks.
      s <- tapply(y, chunks, sum.na.rm) / tapply(n, chunks, sum.na.rm)
      # Set to NA the days before TWITTER_START_DATE in the case of Twitter.
      if (medium == 'TWITTER') {
        s[as.numeric(as.Date(DAYS[as.Date(DAYS) < TWITTER_START_DATE])-dod) + offset] <- NA
      }
      return(s)
    }, mc.cores=6))
    x <- as.matrix(x)
    save(x, file=file)
  } else {
    load(file)
  }
  return(x)
}

filter_people <- function(medium, x) {
  N_immed_after <- 100
  num_finite_before <- rowSums(is.finite(x[,colnames(x) %in% -N:-1]))
  num_finite_immed_after <- rowSums(is.finite(x[,colnames(x) %in% 0:(N_immed_after-1)]))
  num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
  mids <- names(which(
    # Keep only people whose boundaries aren't missing.
    is.finite(x[,colnames(x)==N]) & is.finite(x[,colnames(x)==-N])
    # Keep only people that have no missing data in the N_immed_after days immediately after death.
    & num_finite_immed_after == N_immed_after
    # Keep only people that were mentioned on at least 5 days before they died.
    & num_active_before >= 5
    # Keep only people that died after the point from which on we have a lot of Twitter data.
    & rownames(x) %in% died_in_window
    # Discard people with parentheses in their names because those names, although unique in
    # Wikipedia, are unlikely to ever be used in real prose.
    & !grepl('\\(', mid_to_wiki[rownames(x)])))
  return(mids)
}

# Apply Friedman's super smoother, hell yeah.
supersmooth <- function(y) {
  reldates <- as.numeric(names(y))
  suppressWarnings({
    smoothed_left <- supsmu(reldates[reldates<0], y[reldates<0])
    smoothed_right <- supsmu(reldates[reldates>=0], y[reldates>=0])
  })
  smoothed <- c(smoothed_left$y, smoothed_right$y)
  names(smoothed) <- c(smoothed_left$x, smoothed_right$x)
  return(smoothed)
}

normalize_and_smooth <- function(medium, x, num_art, mean_center=TRUE) {
  if (medium != 'NEWS' && medium != 'TWITTER') {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }
  # Smoothing term: we add 1 mention (as computed on the highest-volume day) to each day.
  eps <- 1 / max(num_art, na.rm=TRUE)
  file <- sprintf('%s/RData/clustering_input_%s%s.RData', DATADIR, medium, if (mean_center) '_MEANCENTERED' else '')
  if (!file.exists(file)) {
    # Select the subset of reldates; keep a month of padding, so smoothing is more robust at the
    # boundaries,
    X <- x[,colnames(x) %in% -(N+30):(N+30)]
    reldates <- as.numeric(colnames(X))
    # Add-eps smoothing.
    X <- X + eps
    # Normalize and smooth all time series.
    X <- t(apply(X, 1, function(y_unnorm) {
      # Log10-transform.
      y <- log10(y_unnorm)
      # Smooth.
      y <- supersmooth(y)
      # Subtract the pre-mortem mean. Now we have log10(y_unnorm/mean*(y_unnorm)), where mean* is the
      # exponential of the mean in log space (i.e., a more robust version of the mean).
      # In the mean computation we exclude the 30 days immediately before death, to mitigate the
      # impact of the final sick days for people whose death was foreseeable.
      if (mean_center) y <- y - mean(y[reldates %in% -N:-30], na.rm=TRUE)
      # Select the subset of reldates.
      y <- y[names(y) %in% -N:N]
      # Interpolate missing values; rule=2 means that at the boundaries the values at the boundaries are
      # imputed for missing values. (But since we require our time series not to end in a missing value,
      # this shouldn't happen in the post-mortem period.)
      y <- approx(names(y), y, -N:N, rule=2)$y
      return(y)
    }))
    colnames(X) <- -N:N
    save(X, file=file)
  } else {
    load(file)
  }
  return(X)
}

# Normalize the time series to [0,1] and measure the fraction of time it takes until half of the
# total post-mortem volume has been seen. The simplest version maps the min to 0, but this is quite
# sensitive to low outliers, so we also allow for versions where we first set to 0 all values
# less than the specified quantile.
time_till_half <- function(y, thresh_quantile=0) {
  # Start with the day after death, since the news might not be reporting yet on the day of.
  y <- y[as.numeric(names(y)) >= 1]
  y <- pmax(y, quantile(y, thresh_quantile))
  y <- (y - min(y)) / (max(y) - min(y))
  min(which(cumsum(y) >= 0.5*sum(y))) / length(y)
}

compute_curve_stats <- function(medium, x, X, num_art) {
  if (medium != 'NEWS' && medium != 'TWITTER') {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }

  file <- sprintf('%s/mention_freq_for_spreadsheet_%s.csv', DATADIR, medium)
  if (!file.exists(file)) {
    # Find the people that died (un)natural deaths.
    mids <- rownames(x)
    mid_nat <- intersect(mids, names(which(get_cause_of_death_map()==1)))
    mid_unnat <- intersect(mids, names(which(get_cause_of_death_map()==-1)))

    # The number of days we consider before and after death.
    subrange_before <- -N:-30
    subrange_peak <- 0:29
    subrange_after <- 30:N
    
    x <- x[,colnames(x) %in% -N:N]
    
    num_active <- rowSums(x > 0, na.rm=TRUE)
    num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
    num_active_after <- rowSums(x[,colnames(x) %in% 0:N] > 0, na.rm=TRUE)
    
    num_finite_before <- rowSums(is.finite(x[,colnames(x) %in% -N:-1]))
    num_finite_after <- rowSums(is.finite(x[,colnames(x) %in% 0:N]))
    
    stats <- data.frame(mid=NA,
                        name=NA,
                        image=NA,
                        num_active_days=NA,
                        active_fraction_before=NA,
                        active_fraction_after=NA,
                        max_smoothed_before=NA,
                        max_raw_before=NA,
                        mean_before=NA,
                        mean_after=NA,
                        mean_after_peak=NA,
                        peak_smoothed=NA,
                        peak_raw=NA,
                        peak_raw_day=NA,
                        time_till_half=NA,
                        time_till_half_thresh=NA,
                        natural_death=NA,
                        unnatural_death=NA,
                        gender=NA,
                        cause_of_death=NA,
                        age=NA,
                        notable_type=NA)[-1,]

    for (i in 1:nrow(X)) {
      if (i %% 100 == 0) print(i)
      mid <- rownames(X)[i]
      name <- fancy_wiki(mid_to_wiki[mid])
      img_name <- if (!is.na(name)) gsub('(%(25)?..)+', '+', name) else sprintf('NA_%s', gsub('.*/m\\.(.*)>', '\\1', mid))
      raw <- log10(x[mid,])
      smoothed <- X[mid,]
      
      mean_before <- mean(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE)
      peak_smoothed <- max(smoothed[names(smoothed) %in% subrange_peak])
      peak_raw <- max(raw[names(raw) %in% subrange_peak])
      # -1 such that 0 represents day of death.
      peak_raw_day <- which.max(raw[names(raw) %in% subrange_peak]) - 1
      
      stats[mid,] <- c(
        fancy_mid(mid),
        name,
        sprintf('=image("http://infolab.stanford.edu/~west1/death/mention_freq_curves/%s/%s.png")',
                medium, img_name),
        num_active[mid],
        num_active_before[mid]/num_finite_before[mid],
        num_active_after[mid]/num_finite_after[mid],
        max(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE),
        max(raw[names(raw) %in% subrange_before], na.rm=TRUE),
        mean_before,
        mean(smoothed[names(smoothed) %in% c(subrange_peak, subrange_after)]),
        mean(smoothed[names(smoothed) %in% subrange_after]),
        peak_smoothed,
        peak_raw,
        peak_raw_day,
        time_till_half(smoothed),
        time_till_half(smoothed, 0.25),
        if (mid %in% mid_nat) 1 else 0,
        if (mid %in% mid_unnat) 1 else 0,
        as.character(props[mid, 'gender']),
        as.character(props[mid, 'cause_of_death']),
        props[mid, 'age'],
        as.character(props[mid, 'notable_types'])
      )
    }

    # Write the data into a CSV file.
    write.table(stats, file, quote=FALSE, sep='\t', row.names=FALSE)
  } else {
    stats <- read.table(file, header=TRUE, sep='\t', comment.char='', quote='')
  }
  return(stats)
}

load_stats <- function(medium, x, X, num_art) {
  if (medium != 'NEWS' && medium != 'TWITTER') {
    stop('Medium must be \'NEWS\' or \'TWITTER\'')
  }
  stats <- compute_curve_stats(medium, x, X, num_art)
  rownames(stats) <- long_mid(stats$mid)
  # If the person has 0 mentions on the day of death, the log will be -Inf.
  # Replace those by the smallest finite number.
  stats$peak_raw[!is.finite(stats$peak_raw)] <- min(stats$peak_raw[is.finite(stats$peak_raw)])
  # Since the quantities are logarithmic (log10), taking the difference is equivalent to the log10
  # of the ratio.
  stats$peak_mean_boost <- stats$peak_raw - stats$mean_before
  stats$peak_max_boost <- stats$peak_raw - stats$max_raw_before
  stats$perm_boost <- stats$mean_after_peak - stats$mean_before
  stats$perm_boost_all <- stats$mean_after - stats$mean_before
  # Some rank transforms.
  r <- function(x) {
    n <- length(x)
    # Keys: indices in x; values: ranks.
    rank_map <- 1:n
    names(rank_map) <- order(x)
    rank_map[as.character(1:n)]
  }
  stats$mean_before_rank <- r(stats$mean_before)
  stats$mean_after_rank <- r(stats$mean_after)
  stats$peak_mean_boost_rank <- r(stats$peak_mean_boost)
  stats$perm_boost_rank <- r(stats$perm_boost)
  stats$perm_boost_all_rank <- r(stats$perm_boost_all)
  stats$time_till_half_rank <- r(stats$time_till_half)
  # Add a categorical death-type column.
  stats$death_type <- 'unknown'
  stats$death_type[stats$natural_death==1] <- 'natural'
  stats$death_type[stats$unnatural_death==1] <- 'unnatural'
  stats$death_type <- as.factor(stats$death_type)
  # Add age groups.
  stats$age_group <- floor(stats$age/10)*10
  # Add anglo flag.
  country_known <- props$mid[props$nationality!='']
  anglos <- props$mid[grepl(ANGLO_COUNTRY_REGEX, props$nationality)]
  stats$anglo <- 'unknown'
  stats$anglo[rownames(stats) %in% country_known] <- 'non_anglo'
  stats$anglo[rownames(stats) %in% anglos] <- 'anglo'
  stats$anglo <- as.factor(stats$anglo)
  # Notable types for all people.
  types <- tax_types[strip_plaintext_name_from_mid(props$notable_types), 'level1']
  names(types) <- props$mid
  stats$type_group <- types[rownames(stats)]
  return(stats)
}

# The stats of all people, rather than just the couple of thousands included in our study.
load_stats_all <- function() {
  stats_all <- props[props$mid %in% died_in_window, c('gender', 'age', 'cause_of_death', 'nationality', 'notable_types')]
  stats_all$death_type <- 'unknown'
  stats_all$death_type[rownames(stats_all) %in% names(which(get_cause_of_death_map()==1))] <- 'natural'
  stats_all$death_type[rownames(stats_all) %in% names(which(get_cause_of_death_map()==-1))] <- 'unnatural'
  stats_all$death_type <- as.factor(stats_all$death_type)
  country_known <- rownames(stats_all)[stats_all$nationality!='']
  anglos <- rownames(stats_all)[grepl(ANGLO_COUNTRY_REGEX, stats_all$nationality)]
  stats_all$anglo <- 'unknown'
  stats_all$anglo[rownames(stats_all) %in% country_known] <- 'non_anglo'
  stats_all$anglo[rownames(stats_all) %in% anglos] <- 'anglo'
  stats_all$anglo <- as.factor(stats_all$anglo)
  types <- tax_types[strip_plaintext_name_from_mid(stats_all$notable_types), 'level1']
  names(types) <- rownames(stats_all)
  stats_all$type_group <- types[rownames(stats_all)]
  return(stats_all)
}

make_bio_table <- function() {
  to_percentage <- function(x) sprintf('%.0f%%', x*100)
  summary_table_all <- NULL
  summary_table_filtered <- NULL
  summary_table_lm <- NULL
  
  ### Age
  
  summary_table_filtered['Age'] <- ''
  summary_table_all['Age'] <- ''
  summary_table_lm['Age'] <- ''
  
  summary_table_filtered['Age N/A'] <- to_percentage(mean(is.na(stats_N$age)))
  summary_table_filtered[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_N$age)[c(2,4,3,5)])
  summary_table_all['Age N/A'] <- to_percentage(mean(is.na(stats_all$age)))
  summary_table_all[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_all$age[stats_all$age <= 120])[c(2,4,3,5)])
  summary_table_lm['Age N/A'] <- to_percentage(mean(is.na(lmdata_N$age)))
  summary_table_lm[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(lmdata_N$age)[c(2,4,3,5)])
  
  ### Gender
  
  summary_table_filtered['Gender'] <- ''
  summary_table_all['Gender'] <- ''
  summary_table_lm['Gender'] <- ''
  
  idx <- stats_N$gender %in% c('Female', 'Male')
  summary_table_filtered['Gender N/A'] <- to_percentage(mean(!idx))
  summary_table_filtered[c('Female', 'Male')] <- to_percentage((summary(stats_N$gender[idx]) / sum(summary(stats_N$gender[idx])))[c('Female', 'Male')])
  
  idx <- stats_all$gender %in% c('Female', 'Male')
  summary_table_all['Gender N/A'] <- to_percentage(mean(!idx))
  summary_table_all[c('Female', 'Male')] <- to_percentage((summary(stats_all$gender[idx]) / sum(summary(stats_all$gender[idx])))[c('Female', 'Male')])
  
  idx <- lmdata_N$gender %in% c('Female', 'Male')
  summary_table_lm['Gender N/A'] <- to_percentage(mean(!idx))
  summary_table_lm[c('Female', 'Male')] <- to_percentage((summary(lmdata_N$gender[idx]) / sum(summary(lmdata_N$gender[idx])))[c('Female', 'Male')])
  
  ### Manner of death
  
  summary_table_filtered['Manner of death'] <- ''
  summary_table_all['Manner of death'] <- ''
  summary_table_lm['Manner of death'] <- ''
  
  idx <- stats_N$death_type != 'unknown'
  summary_table_filtered['Manner of death N/A'] <- to_percentage(mean(!idx))
  summary_table_filtered[c('Natural', 'Unnatural')] <-
    to_percentage((summary(stats_N$death_type[idx]) / sum(summary(stats_N$death_type[idx])))[c('natural', 'unnatural')])
  
  idx <- stats_all$death_type != 'unknown'
  summary_table_all['Manner of death N/A'] <- to_percentage(mean(!idx))
  summary_table_all[c('Natural', 'Unnatural')] <-
    to_percentage((summary(stats_all$death_type[idx]) / sum(summary(stats_all$death_type[idx])))[c('natural', 'unnatural')])
  
  idx <- lmdata_N$death_type != 'unknown'
  summary_table_lm['Manner of death N/A'] <- to_percentage(mean(!idx))
  summary_table_lm[c('Natural', 'Unnatural')] <-
    to_percentage((summary(lmdata_N$death_type[idx]) / sum(summary(lmdata_N$death_type[idx])))[c('natural', 'unnatural')])
  
  ### Language
  
  summary_table_filtered['Language'] <- ''
  summary_table_all['Language'] <- ''
  summary_table_lm['Language'] <- ''
  
  idx <- stats_N$anglo != 'unknown'
  summary_table_filtered['Language N/A'] <- to_percentage(mean(!idx))
  summary_table_filtered[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_N$anglo[idx]) / sum(summary(stats_N$anglo[idx])))[c('anglo', 'non_anglo')])
  
  idx <- stats_all$anglo != 'unknown'
  summary_table_all['Language N/A'] <- to_percentage(mean(!idx))
  summary_table_all[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_all$anglo[idx]) / sum(summary(stats_all$anglo[idx])))[c('anglo', 'non_anglo')])
  
  idx <- lmdata_N$anglo != 'unknown'
  summary_table_lm['Language N/A'] <- to_percentage(mean(!idx))
  summary_table_lm[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(lmdata_N$anglo[idx]) / sum(summary(lmdata_N$anglo[idx])))[c('anglo', 'non_anglo')])
  
  ### Notable type
  
  summary_table_filtered['Notability type'] <- ''
  summary_table_all['Notability type'] <- ''
  summary_table_lm['Notability type'] <- ''
  
  idx <- !is.na(stats_N$type_group)
  summ <- summary(stats_N$type_group[idx])[type_groups]
  summary_table_filtered['Notability type N/A'] <- to_percentage(mean(!idx))
  summary_table_filtered[names(summ)] <- to_percentage(summ / sum(summ))
  
  idx <- !is.na(stats_all$type_group)
  summ <- summary(stats_all$type_group[idx])[type_groups]
  summary_table_all['Notability type N/A'] <- to_percentage(mean(!idx))
  summary_table_all[names(summ)] <- to_percentage(summ / sum(summ))
  
  idx <- !is.na(lmdata_N$type_group)
  summ <- summary(lmdata_N$type_group[idx])[type_groups]
  summary_table_lm['Notability type N/A'] <- to_percentage(mean(!idx))
  summary_table_lm[names(summ)] <- to_percentage(summ / sum(summ))
  
  summary_table_filtered['Count'] <- nrow(stats_N)
  summary_table_all['Count'] <- nrow(stats_all)
  summary_table_lm['Count'] <- nrow(lmdata_N)
  
  # Return a table with stats for both "all" and "filtered" (i.e., those included in the study).
  data.frame(All=summary_table_all, Included=summary_table_filtered, Regression=summary_table_lm)
}

plot_num_art <- function(num_art, medium) {
  par(mar=c(6,5,2,2))
  idx <- which(names(num_art)==TWITTER_START_DATE):which(names(num_art)==MAX_DATE)
  plot(num_art[idx], axes=FALSE, log='y', type='p', xlab='', ylab='Number of documents',
       col=COL[[medium]], main=medium, ylim=c(5e4,1e8))
  axis(2)
  days <- DAYS[idx]
  ticks <- which(grepl('-01$', days))
  axis(1, at=ticks, labels=rep('', length(ticks)), col.ticks='gray')
  ticks <- which(grepl('-01-01$', days))
  axis(1, at=ticks, labels=days[ticks], las=2)
  # par(.default_par)
}

plot_time_series_for_small_multiples <- function(name, medium, X, x, num_art,
                                                 xlab=TRUE, ylab=TRUE) {
  if (SAVE_PLOTS) pdf(sprintf('%s/mention_curve_%s_%s.pdf', PLOTDIR, name, medium), width=2, height=2,
                      pointsize=8, family='Helvetica', useDingbats=FALSE)
  par(mar=c(3,3,1,1))
  mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
  ymin <- log10(1 / max(num_art, na.rm=TRUE))
  raw <- log10(x[mid, colnames(x) %in% -100:360])
  smoothed <- X[mid, colnames(X) %in% -100:360]
  plot(-100:360, raw, col=COL_LIGHT[[medium]],
       bty='n', xlab='', xlim=c(-100,360), ylim=c(ymin,-1), ylab='',
       panel.first=abline(v=0, col='gray'))
  text(360, -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1)
  lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
  if (xlab) mtext('Days since death', side=1, line=2)
  if (ylab) mtext('Fraction mentioning docs [log10]', side=2, line=2)
  if (SAVE_PLOTS) dev.off()
}

plot_max_mem_hist <- function(medium) {
  if (SAVE_PLOTS) cairo_pdf(sprintf('%s/max_mem_hist_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
                            pointsize=8, family='Helvetica')
  par(mar=c(3,3,1,3))
  if (medium == 'NEWS') {
    x <- x_N
  } else if (medium == 'TWITTER') {
    x <- x_T
  } else {
    stop("medium must be NEWS or TWITTER!")
  }
  t_max <- 100
  max_days <- apply(x[,colnames(x) %in% 0:t_max], 1, function(x) which.max(x) - 1)
  h <- hist(max_days, breaks=0:(t_max+1), right=FALSE, include.lowest=FALSE, xlab='', ylab='',
            main=NULL, axes=FALSE, col=COL[[medium]], border=NA, xlim=c(0,t_max+1), ylim=c(0,800))
  cum <- cumsum(h$counts)/sum(h$counts)
  ticks <- seq(0, t_max, 10)
  axis(1, at=ticks+0.5, labels=ticks, las=2)
  mtext('Days since death', side=1, line=2)
  axis(2, col=COL[[medium]], col.axis=COL[[medium]])
  mtext('Number of people', side=2, line=2, col=COL[[medium]])
  par(new=TRUE)
  plot((0:t_max)+0.5, cum, type='p', xlab='', ylab='', xlim=c(0,t_max+1), ylim=c(0,1), axes=FALSE,
       panel.first=abline(h=cum[30], v=29.5, col='gray', lty=2))
  axis(4)
  mtext('Cumulative rel. frequency', side=4, line=2)
  text(29.5, cum[30], labels=sprintf('%.1f%% had maximum mention\n frequency by day 29', 100*cum[30]),
       adj=c(-0.1,1.5), col='gray')
  if (SAVE_PLOTS) dev.off()
  return(list(hist=h, max_days=max_days))
}

make_df_for_fit <- function(medium) {
  if (medium == 'NEWS') {
    x <- x_N
  } else if (medium == 'TWITTER') {
    x <- x_T
  } else {
    stop("medium must be NEWS or TWITTER!")
  }
  
  t <- 1:400
  eps <- min(x[x>0], na.rm=TRUE)
  y <- 10^colMeans(log10(x[,colnames(x) %in% t] + eps), na.rm=TRUE)
  df <- data.frame(y=y, t=t)
  return(df)
}

biexp_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  
  # First approximation: r = 0.
  # log(y) ~ log(N*exp(-p*t)) = log(N)-p*t
  model0 <- lm(log(y) ~ t, data=df)
  N0 <- exp(coef(model0)[1])
  p0 <- -coef(model0)[2]
  
  # Second approximation: q = 0.
  model1 <- nls(log(y) ~ log( N/(p+r) * (p*exp(-(p+r)*t) + r) ),
                start=list(p=p0, N=N0, r=0),
                data=df, control=list(maxiter=1e8), algorithm='port')
  N1 <- coef(model1)['N']
  p1 <- coef(model1)['p']
  r1 <- coef(model1)['r']
  
  # Full model.
  model <- nls(log(y) ~ log( N/(p-q+r) * ((p-q)*exp(-(p+r)*t) + r*exp(-q*t)) ),
               start=list(p=p1, N=N1, r=r1, q=0),
               data=df, control=list(maxiter=1e8), algorithm='port', lower=c(q=0))
  return(model)
}

exp_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # log(y) ~ log(a*exp(-b*t)) = log(a)-b*t
  model <- lm(log(y) ~ t, data=df)
  return(model)
}

exp_power_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # Start with a good guess.
  # log(y) ~ log(a*exp(b*t^(-1))) = log(a)+b*t^(-1)
  c0 <- -1
  model0 <- lm(log(y) ~ I(t^c0), data=df)
  log_a0 <- coef(model0)[1]
  b0 <- coef(model0)[2]
  model <- nls(log(y) ~ log_a + b*t^c,
               start=list(log_a=log_a0, b=b0, c=c0),
               data=df, control=list(maxiter=1e8), algorithm='port')
  
  return(model)
}

lognorm_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # log(y) ~ a + b*log(t) + c*log(t)^2
  model <- lm(log(y) ~ log(t) + I(log(t)^2), data=df)
  return(model)
}

lognorm_constrained_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  df$combo <- log(df$t)^2 - 2*log(max(df$t))*log(df$t)
  # Requiring the vertex to be at the maximum time value introduces the constraint
  # b = -2c*log(t_max), which reduces the problem to the following:
  # log(y) ~ a + c*(log(t)^2 - 2*log(t_max)*log(t))
  model <- lm(log(y) ~ combo, data=df)
  return(model)
}

power_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # log(y) ~ a + b*log(t)
  model <- lm(log(y) ~ log(t), data=df)
  return(model)
}

log_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # y ~ a + b*log(t)
  model0 <- lm(y ~ log(t), data=df)
  a0 <- coef(model0)[1]
  b0 <- coef(model0)[2]
  # Necessary to avoid negative argument to log in nls fit (for TWITTER only).
  if (a0 + b0*log(max(df$t)) <= 0) a0 <- -b0*log(max(df$t)) + 1e-8
  # log(y) ~ log(a + b*log(t))
  model <- nls(log(y) ~ log(a + b*log(t)),
               # Without "/2", we fail to find an optimum in the case of TWITTER.
               start=list(a=a0/2, b=b0/2),
               data=df, control=list(maxiter=1e8))
  return(model)
}

hyperbolic_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # 1/y ~ a + b*t
  model0 <- lm(1/y ~ t, data=df)
  a0 <- coef(model0)[1]
  b0 <- coef(model0)[2]
  # log(y) ~ -log(a + b*t)
  model <- nls(log(y) ~ -log(a + b*t),
               start=list(a=a0, b=b0),
               data=df, control=list(maxiter=1e8), algorithm='port')
  return(model)
}

hyperbolic_power_fit <- function(medium) {
  df <- make_df_for_fit(medium)
  # Start with a good guess.
  # 1/y ~ a + b*t^c
  c0 <- -7
  model0 <- lm(1/y ~ I(t^c0), data=df)
  a0 <- coef(model0)[1]
  b0 <- coef(model0)[2]
  # log(y) ~ -log(a + b*t^c)
  model <- nls(log(y) ~ -log(a + b*t^c),
               start=list(a=a0, b=b0, c=c0),
               data=df, control=list(maxiter=1e9, warnOnly=TRUE), algorithm='port', trace=FALSE,
               lower=c(a=0, b=-Inf, c=-Inf), upper=c(a=Inf, b=0, c=0))
  return(model)
}

plot_avg_fraction_of_mentioning_docs <- function(medium, biexp_model=NULL) {
  if (SAVE_PLOTS) pdf(sprintf('%s/avg_mention_curve_%s.pdf', PLOTDIR, medium), width=3.4, height=3.4,
                      pointsize=8, family='Helvetica', useDingbats=FALSE)
  par(mar=c(3,3,1,1))
  if (medium == 'NEWS') {
    x <- x_N
  } else if (medium == 'TWITTER') {
    x <- x_T
  } else {
    stop("medium must be NEWS or TWITTER!")
  }
  max_day <- 400
  days <- -100:max_day
  eps <- min(x[x>0], na.rm=TRUE)

  # Log.
  y <- colMeans(log10(x[,colnames(x) %in% days] + eps), na.rm=TRUE)
  par(fig=c(0, 1, 0, 1))
  plot(days, y, type='p', bty='n', lwd=2, xlab='', ylab='', #xaxt='n', yaxt='n',
       panel.first=abline(v=0, col='gray', lwd=1, lty=2), col=COL[[medium]], las=0)
  mtext('Days since death', side=1, line=2)
  mtext('Fraction mentioning documents [log10]', side=2, line=2)
  y365 <- y['365']
  arrows(x0=325, x1=355, y0=y365, y1=y365, length=0.05)
  text(325, y365, '365 days', adj=c(1.1, 0.5))

  if (!is.null(biexp_model)) {
    biexp_coefs <- coef(biexp_model)
    N <- biexp_coefs['N']
    p <- biexp_coefs['p']
    r <- biexp_coefs['r']
    q <- biexp_coefs['q']

    t <- 1:max_day
    mem_comm <- N * exp(-(p+r)*t)
    mem_cult <- N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
    mem <- mem_comm + mem_cult
    
    lines(t, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
    lines(t, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
    lines(t, log10(mem), lty=1, lwd=2, col='black')

    legend('topright', bty='n',
           legend=c('Daily average', 'Biexponential fit S(t) = u(t) + v(t)',
                    'Communicative memory u(t)', 'Cultural memory v(t)'),
           seg.len=3,
           pch=c(1,NA,NA,NA),
           lty=c(0,1,1,1), #lty=c(0,1,4,2),
           col=c(COL[[medium]], 'black', COL_GREEN, COL_RED),
           lwd=c(2, 2, 2, 2))
    
    # Log-log.
    par(fig=c(0.35, 0.9, 0.27, 0.77), new=TRUE)
    y <- y[names(y) %in% t]
    # Here the day of death is shown as day 1 (due to log axes).
    plot(t, y, type='p', bty='n', xlab='', ylab='', lwd=2, col=COL[[medium]], las=0, log='x')
    lines(t, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
    lines(t, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
    lines(t, log10(mem), lty=1, lwd=2, col='black')
    legend('topright', bty='n', legend='Log x-scale', inset=c(0.1,0.2))
  }

  if (SAVE_PLOTS) dev.off()
}

plot_model_fit <- function(medium, name, model, predict, log='') {
  if (medium == 'NEWS') {
    x <- x_N
  } else if (medium == 'TWITTER') {
    x <- x_T
  } else {
    stop("medium must be NEWS or TWITTER!")
  }
  
  df <- make_df_for_fit(medium)
  t <- df$t
  y <- log10(df$y)
  yhat <- log10(predict)
  
  par(mar=c(3,3,1,1))
  plot(t, y, type='p', bty='n', lwd=2, xlab='', ylab='', col=COL[[medium]], las=0, log=log)
  mtext('Days since death', side=1, line=2, cex=0.6)
  mtext('Mention frequency [log10]', side=2, line=2, cex=0.6)
  lines(t, yhat, lwd=2)

  aic <- AIC(model)
  r2 <- cor(yhat, y)^2
  if (log=='x') text(400, max(y),
                      sprintf('R² = %.3f\nAIC = %.0f', r2, aic),
                      adj=c(1,1), cex=1)
  else text(200, max(y), name, adj=c(0.5,1), cex=1.4)
}

plot_all_model_fits <- function(medium) {
  models <- list(
                 "Exponential"=exp_fit(medium),
                 "Hyperbolic"=hyperbolic_fit(medium),
                 "Logarithmic"=log_fit(medium),
                 "Power"=power_fit(medium),
                 "Hyperbolic-power"=hyperbolic_power_fit(medium),
                 "Log-normal-inspired"=lognorm_constrained_fit(medium),
                 "Exponential-power"=exp_power_fit(medium),
                 "Biexponential"=biexp_fit(medium)
                 )
  if (SAVE_PLOTS) pdf(sprintf('%s/all_model_fits_%s.pdf', PLOTDIR, medium),
                      width=3.5, height=9, pointsize=8, family='Helvetica', useDingbats=FALSE)
  par(mfcol=c(8,2))
  for (log in c('', 'x')) {
    for (name in names(models)) {
      model <- models[[name]]
      plot_model_fit(medium, name, model, exp(predict(model)), log=log)
    }
  }
  if (SAVE_PLOTS) dev.off()
}

plot_comm_vs_cult_mem <- function(medium, biexp_coefs=NULL) {
  if (SAVE_PLOTS) pdf(sprintf('%s/comm_vs_cult_mem_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
                      pointsize=8, family='Helvetica', useDingbats=FALSE)
  par(mar=c(4,4,1,1))
  if (medium == 'NEWS') {
    x <- x_N
  } else if (medium == 'TWITTER') {
    x <- x_T
  } else {
    stop("medium must be NEWS or TWITTER!")
  }
  max_day <- 40
  t <- seq(0, max_day-1, 0.01)
  N <- biexp_coefs['N']
  p <- biexp_coefs['p']
  r <- biexp_coefs['r']
  q <- biexp_coefs['q']
  u <- function(t, N, p, r, q) N * exp(-(p+r)*t)
  v <- function(t, N, p, r, q) N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
  mem_comm <- u(t, N, p, r, q)
  mem_cult <- v(t, N, p, r, q)
  ratio <- mem_cult / (mem_comm + mem_cult)
  plot(t+1, ratio, type='l', bty='n', col=COL[[medium]], lwd=2,
       xlab='Days since death', ylab='Fraction of cultural memory')
  for (perc in c(0.5, 0.99)) {
    idx <- min(which(ratio>=perc))
    points(t[idx]+1, ratio[idx], lwd=3, pch=20, col='black', cex=2)
    segments(-5, ratio[idx], t[idx]+1, ratio[idx], col='black', lty=2)
    segments(t[idx]+1, -5, t[idx]+1, ratio[idx], col='black', lty=2)
    text(t[idx]+1, ratio[idx], adj=c(-0.2, 1.2),
         labels=sprintf('%.0f%% after\n%.0f days', 100*perc, ceiling(t[idx]+1)))
  }
  if (SAVE_PLOTS) dev.off()
}

overlay_time_series <- function(X, medium) {
    matplot(-N:N, t(X), type='l', lty=1, lwd=2, col=COL_XLIGHT[[medium]], xlab='Days since death',
            ylab='Fraction mentioning docs [log10]', ylim=range(X), main=medium, bty='n')
}

smoothed_density <- function(x, bw=0.05, range=c(0,1)) {
  # Reflect the data to avoid drops at boundaries.
  left <- -(x-range[1]) + range[1]
  right <- left + 2*(range[2]-range[1])
  x <- c(left, x, right)
  est <- bkde(x, bandwidth=bw, range=range(x), gridsize=1000)
  # Consider only the original range.
  idx <- which(est$x >= range[1] & est$x <= range[2])
  est$x <- est$x[idx]
  # Renormalize.
  est$y <- 3*est$y[idx]
  est
}

# Smoothed histograms of curve properties for the different people groups.
plot_smoothed_densities_raw <- function(stats, medium, prop, groups, main=NULL) {
  nbw <- 20
  chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
  par(mfrow=c(length(groups), length(chars)), mar=c(4,4,2,2))
  for (char in chars) {
    # Find the extreme y-value to be plotted.
    ymin <- Inf
    ymax <- -Inf
    for (group in groups) {
      mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
      x <- stats[mask,char]
      sm <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))$y
      ymin <- min(ymin, min(sm))
      ymax <- max(ymax, max(sm))
    }
    for (group in groups) {
      mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
      x <- stats[mask,char]
      est <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))
      # The estimate without reflected data, that is it drops at the boundaries.
      est_ <- bkde(x, bandwidth=(max(x)-min(x))/nbw, range=range(x))
      plot(est$x, est$y, ylim=c(ymin,ymax), type='l', lwd=2,
           main=if (is.null(main)) group else main,
           col=COL[[medium]], xlab=FANCY_CURVE_CHAR_NAMES[char], ylab='Density', bty='n')
      rug(x)
    }
  }
  par(.default_par)
}

plot_chars_by_group <- function(stats, medium, prop, groups) {
  chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
  par(mfrow=c(1, length(chars)), mar=c(10,4,2,2))
  for (char in chars) {
    char_rank <- sprintf('%s_rank', char)
    grouped <- split(stats[,char_rank]/nrow(stats), prop)[as.character(groups)]
    boxplot(grouped, las=3, notch=TRUE, main=FANCY_CURVE_CHAR_NAMES[char],
            col=COL_LIGHT[[medium]], bty='n', ylab='Relative rank')
  }
  par(.default_par)
}

smoothed_2d_density <- function(x, bw) {
  # Reflect the data to avoid drops at boundaries.
  x <- rbind(cbind( -x[,1],  -x[,2]),
             cbind( -x[,1],   x[,2]),
             cbind( -x[,1], 2-x[,2]),
             cbind(  x[,1],  -x[,2]),
             cbind(  x[,1],   x[,2]),
             cbind(  x[,1], 2-x[,2]),
             cbind(2-x[,1],  -x[,2]),
             cbind(2-x[,1],   x[,2]),
             cbind(2-x[,1], 2-x[,2]))
  est <- bkde2D(x, range.x=list(c(-1,2), c(-1,2)), bandwidth=c(bw,bw), gridsize=c(153,153))
  # Consider only the original range [0,1].
  idx <- which(est$x1 >= 0 & est$x1 <= 1)
  est$x1 <- est$x1[idx]
  est$x2 <- est$x2[idx]
  # Renormalize.
  est$fhat <- 9*est$fhat[idx,idx]
  est
}

plot_smoothed_2d_densities <- function(stats, medium, char1, char2, prop, groups, main=NULL) {
  bw <- 0.1
  char1_rank <- sprintf('%s_rank', char1)
  char2_rank <- sprintf('%s_rank', char2)
  # Find the maximum y-value to be plotted.
  ymax <- -Inf
  for (group in groups) {
    mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
    x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
    ymax <- max(ymax, max(smoothed_2d_density(x, bw=bw)$fhat))
  }
  for (group in groups) {
    mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
    x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
    est <- smoothed_2d_density(x, bw=bw)
    filled.contour(est$x1, est$x2, est$fhat, zlim=c(0,ymax),
                   color=function(n) gray.colors(n,0,1),
                   plot.title=title(main=sprintf('%s\n(%s)', if (is.null(main)) group else main, medium),
                                    xlab=char1, ylab=char2))
    par(.default_par)
  }
}

cluster <- function(stats, feats) {
  input <- cbind(stats[,feats])
  colnames(input) <- feats
  # z-score normalize.
  input <- t(t(input) - colMeans(input))
  input <- t(t(input) / apply(input, 2, sd))
  K <- 2:30
  kmeans_all <- mclapply(K, function(k) { set.seed(1); kmeans(input, k, nstart=20) })
  # Reorder the clusters.
  for (k in K) {
    km <- kmeans_all[[which(K==k)]]
    ord <- order(km$size, decreasing=TRUE)
    ord_map <- 1:k; names(ord_map) <- ord
    km$size <- km$size[ord]
    km$withinss <- km$withinss[ord]
    km$centers <- km$centers[ord,]
    rownames(km$centers) <- 1:k
    km$cluster <- ord_map[as.character(km$cluster)]
    kmeans_all[[which(K==k)]] <- km
  }
  list(clustering=kmeans_all, diss=dist(input), K=K)
}

plot_sil_width <- function(medium, kmeans_all, diss) {
  avg_sil_width_kmeans <- unlist(lapply(kmeans_all, function(km) summary(silhouette(km$cluster, diss^2))$avg.width))
  plot(2:(length(kmeans_all)+1), avg_sil_width_kmeans, type='b', xlab='Number of clusters',
       ylab='Avg. silhouette width', main=medium, col=COL[[medium]], lwd=2, bty='n')
}

plot_clustered_time_series <- function(medium, kmeans, X) {
  k <- nrow(kmeans$centers)
  par(mfrow=c(1, k))
  for (j in 1:k) {
    matplot(-N:N, t(X[kmeans$cluster==j,]), type='l', lty=1, lwd=2, col=COL_XLIGHT[[medium]],
            xlab='Days since death', ylab='% mentioning docs [log10]', ylim=range(X),
            main=sprintf('%s cluster %s', medium, j))
  }
  par(.default_par)
}

boot_ci <- function(x, func, R=5000) {
  bo <- boot(x, statistic=function(d, i) func(d[i]), R=R)
  ci <- boot.ci(bo, conf=0.95, type="basic")$basic[4:5]
  if (is.null(ci)) {
    upper <- lower <- NA
  } else {
    lower <- ci[1]
    upper <- ci[2]
  } 
  unlist(list(upper=upper, point_est=func(x), lower=lower))
}

make_regression_data <- function(medium) {
  if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
  stats <- if (medium=='NEWS') stats_N else stats_T
  data <- stats[stats$gender != ''
                & stats$death_type != 'unknown'
                & !is.na(stats$type_group) # This removes only 4 people.
                & !is.na(stats$age_group)
                & stats$age %in% 20:99,]
  data$age_group <- factor(data$age_group)
  # Mean (median) age is 71 (74), so use the age bracket 70-79 as the reference level.
  base_age_group  <- '70'
  base_death_type <- 'unknown'
  base_death_type <- 'natural'
  base_type_group <- 'art' # 'general fame'
  base_gender     <- 'Male'
  base_anglo      <- 'anglo'
  data$age_group  <- relevel(data$age_group, base_age_group)
  data$death_type <- relevel(data$death_type, base_death_type)
  data$type_group <- relevel(data$type_group, base_type_group)
  data$gender     <- relevel(data$gender, base_gender)
  data$anglo      <- relevel(data$anglo, base_anglo)
  N <- dim(data)[1]
  relranks <- (0:(N-1))/(N-1) - 0.5
  data$mean_before_relrank[order(data$mean_before_rank)] <- relranks
  data$peak_mean_boost_relrank[order(data$peak_mean_boost_rank)] <- relranks
  data$perm_boost_relrank[order(data$perm_boost_rank)] <- relranks
  return(data)
}

make_regression_data_T_vs_N <- function() {
  lmdata <- make_regression_data('NEWS')
  lmdata$mean_before_relrank_diff     <- lmdata_N$mean_before_relrank - lmdata_T$mean_before_relrank
  lmdata$peak_mean_boost_diff         <- lmdata_N$peak_mean_boost - lmdata_T$peak_mean_boost
  lmdata$perm_boost_diff              <- lmdata_N$perm_boost - lmdata_T$perm_boost
  lmdata$peak_mean_boost_relrank_diff <- lmdata_N$peak_mean_boost_relrank - lmdata_T$peak_mean_boost_relrank
  lmdata$perm_boost_relrank_diff      <- lmdata_N$perm_boost_relrank - lmdata_T$perm_boost_relrank
  return(lmdata)
}

# The real uncertainty in the summed coefficients is larger; need to add separate SEs and entries of vcov
# matrix: https://biologyforfun.wordpress.com/2017/05/17/adding-standard-errors-for-interaction-terms/comment-page-1/
compound_se_for_lm <- function(mod, interact=NULL, base_age=NULL) {
  X <- as.data.frame(vcov(mod))
  idx <- rownames(X)[grep('age_group..$', rownames(X))]
  X[sprintf('age_group%s:%s', base_age, interact),] <- 0
  X[,sprintf('age_group%s:%s', base_age, interact)] <- 0
  sapply(idx, function(i) {
    sub_idx <- c(i, if (is.null(interact)) NULL else c(interact, sprintf('%s:%s', i, interact)))
    sqrt(sum(X[sub_idx, sub_idx]))
    })
}

run_regression_analysis <- function(medium, ranks) {
  if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
  lmdata <- if (medium == 'NEWS') lmdata_N else lmdata_T
  predictors <- 'mean_before_relrank + age_group + death_type + gender + anglo + type_group'
  suffix <- if (ranks) '_relrank' else ''
  mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s ~ %s', suffix, predictors)), lmdata)
  mod_perm <- lm(as.formula(sprintf(     'perm_boost%s ~ %s', suffix, predictors)), lmdata)
  return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}

run_regression_analysis_for_death_types <- function(medium, ranks) {
  if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
  lmdata <- if (medium == 'NEWS') lmdata_N else lmdata_T
  base_death_type <- levels(lmdata$death_type)[1]
  # Compare natural vs. unnatural death for the different age brackets (0 to omit the
  # intercept and include it instead as the coefficient of the base age).
  predictors <- '0 + mean_before_relrank + age_group + death_type + gender + anglo + type_group + age_group:death_type'
  interact <- sprintf('death_type%s', if (base_death_type=='natural') 'unnatural' else 'natural')
  suffix <- if (ranks) '_relrank' else ''
  mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s ~ %s', suffix, predictors)), lmdata)
  mod_perm <- lm(as.formula(sprintf(     'perm_boost%s ~ %s', suffix, predictors)), lmdata)
  return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}

run_regression_analysis_T_vs_N <- function(ranks) {
  lmdata <- make_regression_data_T_vs_N()
  predictors <- 'mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group'
  suffix <- if (ranks) '_relrank' else ''
  mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s_diff ~ %s', suffix, predictors)), lmdata)
  mod_perm <- lm(as.formula(sprintf('perm_boost%s_diff ~ %s', suffix, predictors)), lmdata)
  return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}

run_regression_analysis_T_vs_N_for_death_types <- function(ranks) {
  lmdata <- make_regression_data_T_vs_N()
  predictors <- '0 + mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group + age_group:death_type'
  suffix <- if (ranks) '_relrank' else ''
  mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s_diff ~ %s', suffix, predictors)), lmdata)
  mod_perm <- lm(as.formula(sprintf('perm_boost%s_diff ~ %s', suffix, predictors)), lmdata)
  return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}

plot_age_coeffs_for_death_types <- function(mod, medium, outcome, base_age, ranks,
                                            ylim=NULL, col=COL[[medium]]) {
  summary <- coef(summary(mod))

  # Non-interaction terms (i.e., natural death).
  idx <- grep('^age_group..$', rownames(summary))
  decades <- as.numeric(sub('.*(\\d\\d).*', '\\1', rownames(summary))[idx])
  ord <- order(decades)
  decades <- decades[ord]
  age_coef_natural <- summary[idx,1][ord]
  se_natural <- compound_se_for_lm(mod)[ord]

  # Interaction terms (i.e., unnatural death).
  idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
  summary[idx,1] <- summary[idx,1] + summary['death_typeunnatural',1]
  rownames(summary)[rownames(summary)=='death_typeunnatural'] <- sprintf('age_group%s:death_typeunnatural', base_age)
  # Add age_group baseline coefs to interaction coefs.
  idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
  summary[idx,1] <- summary[idx,1] + summary[grep('age_group..$', rownames(summary)), 1]
  age_coef_unnatural <- summary[idx,1][ord]
  se_unnatural <- compound_se_for_lm(mod, 'death_typeunnatural', base_age)[ord]

  if (is.null(ylim)) ylim <- range(c(age_coef_natural - 2*se_natural,
                                     age_coef_natural + 2*se_natural,
                                     age_coef_unnatural - 2*se_unnatural,
                                     age_coef_unnatural + 2*se_unnatural), na.rm=TRUE)
  col_unnat <- 'gray'
  suffix <- if (ranks) '_relrank' else ''
  if (outcome == 'peak_mean_boost') {
    main <- sprintf('Short-term boost', medium)
    xlab <- 'Average boost'
    outfile <- sprintf('%s/age_coefs_%s_%s%s.pdf', PLOTDIR, medium, outcome, suffix)
  } else if (outcome == 'perm_boost') {
    main <- sprintf('Long-term boost', medium)
    xlab <- 'Average boost'
    outfile <- sprintf('%s/age_coefs_%s_%s%s.pdf', PLOTDIR, medium, outcome, suffix)
  } else if (outcome == 'peak_mean_boost_diff') {
    main <- 'Short-term boost'
    xlab <- 'Average boost difference'
    outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s%s.pdf', PLOTDIR, outcome, suffix)
  } else if (outcome == 'perm_boost_diff') {
    main <- 'Long-term boost'
    xlab <- 'Average boost difference'
    outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s%s.pdf', PLOTDIR, outcome, suffix)
  } else {
    stop('Illegal outcome name')
  }
  
  if (SAVE_PLOTS) cairo_pdf(outfile, width=3.4, height=2, pointsize=8, family='Helvetica')
  par(mar=c(3,3,1,1))
  plot(decades+6, age_coef_unnatural, type='b', panel.first=abline(h=0, col='gray'),
       ylim=ylim, xlim=c(min(decades), max(decades)+10), bty='n', xaxt='n', yaxt='n',
       main=main, xlab='', ylab='', lwd=2, col=col_unnat)
  dispersion(decades+6, age_coef_unnatural, 2*se_unnatural, col=col_unnat, arrow.cap=0.005)
  axis(1, at=c(decades, max(decades)+10), line=0)
  axis(2, line=0)
  mtext('Age at death', 1, line=2)
  mtext(xlab, 2, line=2)
  points(decades+4, age_coef_natural, type='b', lwd=2, col=col)
  dispersion(decades+4, age_coef_natural, 2*se_natural, col=col, arrow.cap=0.005)
  legend('bottomleft', c('Natural death', 'Unnatural death'), col=c(col, col_unnat), lty=1, lwd=2, horiz=TRUE, bty='n')
  if (SAVE_PLOTS) dev.off()
}

print_regression_table <- function(mods_N, mods_T, ranks) {
  age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
  type_names <- levels(lmdata_N$type_group)[-1]
  table <- texreg(list(mods_N$mod_peak, mods_T$mod_peak, mods_N$mod_perm, mods_T$mod_perm), dcolumn=TRUE, booktabs=TRUE,
                  use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
                  custom.model.names=c("\\shortstack{Short-term boost\\\\(News)}",
                                       "\\shortstack{Short-term boost\\\\(Twitter)}",
                                       "\\shortstack{Long-term boost\\\\(News)}",
                                       "\\shortstack{Long-term boost\\\\(Twitter)}"),
                  custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative rank)", age_names, "Manner of death: unnatural",
                                      "Gender: female", paste("Language:", c("non-anglophone", "unknown")),
                                      paste("Notability type:", type_names)),
                  reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
  cat(table, file=sprintf('%s/lm_coef_T_and_N%s.tex', TABLEDIR, if (ranks) '_relrank' else ''), sep="\n")
}

print_regression_table_T_vs_N <- function(mods, ranks) {
  age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
  type_names <- levels(lmdata_N$type_group)[-1]
  table <- texreg(list(mods$mod_peak, mods$mod_perm), dcolumn=TRUE, booktabs=TRUE,
                  use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
                  label = "table:coefficients News vs. Twitter", custom.model.names=c("Short-term boost", "Long-term boost"),
                  custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative-rank diff.)", age_names,
                                      "Manner of death: unnatural",
                                      "Gender: female", paste("Language:", c("non-anglophone", "unknown")),
                                      paste("Notability type:", type_names)),
                  reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
  cat(table, file=sprintf('%s/lm_coef_T_vs_N%s.tex', TABLEDIR, if (ranks) '_relrank' else ''), sep="\n")
}

print_model_description <- function(medium, depvar, ranks) {
  cat('\n\n#####################################################\n')
  cat('REGRESSION MODEL\n')
  cat(sprintf('Medium: %s\n', medium))
  cat(sprintf('Dependent variable: %s\n', depvar))
  cat(sprintf('Transformation on dependent variable: %s\n', if (ranks) 'RELATIVE RANKS' else 'NONE'))
  cat('#####################################################\n')
}

load_doc_length_regression_data <- function() {
  file <- sprintf('%s/RData/log_doc_length_ratios.RData', DATADIR)
  if (!file.exists(file)) {
    # Note: this uses all non-Twitter documents, not only those from the news domains. Also,
    # it uses only a set of 2638 people that were used in an early stage of the project
    # (meeting the criterion "min_num_active_days=50"), which are then reduced to the subset
    # of 579 people that are also used in the main regression analysis about post-mortem mention
    # frequency.
    doc_len_all <- read.table(pipe(sprintf('gunzip -c %s/mean_doc_length_per_day.tsv.gz', DATADIR)),
                              sep='\t', as.is=TRUE, col.names=c('mid', 'date', 'numWords'))
    ok_mids <- intersect(unique(doc_len_all$mid), unique(rownames(lmdata_N)))
    doc_len <- doc_len_all[doc_len_all$mid %in% ok_mids,]
    doc_len$rel_date <- as.numeric(as.Date(doc_len$date) - as.Date(death_dates[doc_len$mid]))
    feature <- 'numWords'
    
    x <- do.call(rbind, mclapply(split(doc_len, doc_len$mid), function(l) {
      dod <- as.Date(death_dates[l$mid[1]])
      offset <- as.numeric(MAX_DATE-MIN_DATE) + 1
      y <- rep(NA, NRELDATES)
      # Log.
      y[l$rel_date + offset] <- log(l[,feature])
      y[as.numeric(as.Date(EMPTY_DAYS)-dod) + offset] <- NA
      return(y)
    }, mc.cores=6))
    extreme <- (NRELDATES-1)/2
    colnames(x) <- -extreme:extreme
    
    x <- x[, colnames(x) %in% -100:N]
    premeans <- rowMeans(x[,colnames(x) <= -30], na.rm=TRUE)
    x_norm <- x - premeans
    regdata <- cbind(lmdata_N[rownames(x_norm),], x_norm)
    save(regdata, file=file)
  } else {
    load(file)
  }
  return(regdata)
}

plot_doc_length_increase <- function(regdata, predictors, ylab=NULL) {
  coefs <- do.call(rbind,
                   lapply(-100:N,
                          function(t) summary(lm(as.formula(sprintf('`%s` ~ %s', t, predictors)),
                                                 regdata))$coefficients[1,1:2]))
  est <- coefs[,1]
  lo <- coefs[,1]-2*coefs[,2]
  hi <- coefs[,1]+2*coefs[,2]

  # Natural deaths.
  plot(-100:N, exp(est)-1, type='l', lwd=2, col='#000000', bty='n', axes=FALSE,
       ylim=c(-0.7,0.7),xlab='Days since death', ylab=ylab,
       main='Mean relative document length increase', cex.main=1,
       panel.first=c(abline(v=0, h=0, col='gray'), #abline(v=c(30), col='red'),
                     polygon(x=c(-100:N, rev(-100:N)), y=c(lo, rev(hi)), col='#00000040', border=NA),
                     axis(1, at=seq(-90,360,30), las=2), axis(2)))

  # Unnatural deaths.
  # predictors <- 'mean_before_relrank + age_group + death_type + anglo'
  # coefs <- do.call(rbind,
  #                  lapply(-100:N,
  #                         function(t) sum(summary(lm(as.formula(sprintf('`%s` ~ %s', t, predictors)),
  #                                                regdata))$coefficients[c(1,10),1])))
  # plot(-100:N, exp(coefs[,1])-1, type='l', lwd=2, col='#000000', bty='n', axes=FALSE, ylim=c(-0.7,0.7),
  #      panel.first=c(abline(v=0, h=0, col='gray'), axis(1, at=seq(-90,360,30), las=2), axis(2)))
}

draw_mega_figure <- function() {
  days <- -100:360
  feat_cols <- c(mean_before=COL_DARKBLUE, peak_mean_boost=COL_MAGENTA, perm_boost=COL_GREEN, time_till_half=COL_RED)
  bg <- function(col) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col=col, border=NA)
  mar <- 0.2
  
  if (SAVE_PLOTS) cairo_pdf(sprintf('%s/megafigure.pdf', PLOTDIR), width=8, height=8, pointsize=6, family='Helvetica')
  
  par(.default_par)
  plot.new()
  rel_line_height <- par('mai')[1]/par('mar')[1]/par('din')
  par(fig=c(0.59, 0.91, 0.09, 0.41), new=TRUE)
  par(mar=c(0,0,0,0))
  confusion <- table(as.factor(km_N$cluster), as.factor(km_T$cluster))
  palmat <- color.scale(confusion^0.5, cs1=c(1,1), cs2=c(1,0.2), cs3=c(1,0.2))
  # Must fix the function definition of color2D.matplot: do fix(color2D.matplot) and replace "f" with "d" on line 101:
  # https://stackoverflow.com/questions/44522895/plot-color-matrix-with-numbers-in-r
  color2D.matplot(confusion, cellcolors=palmat, main='', show.values=1, vcol=rgb(0,0,0), axes=FALSE, vcex=1.2)
  feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
  # bar_range <- 1.1 * range(c(km_N$centers, km_N$centers))
  bar_range <- c(-2.1, 5.2)
  for (medium in c('NEWS', 'TWITTER')) {
    if (medium == 'NEWS') {
      kmeans <- km_N
      X <- X_N
      x <- x_N
      fig <- function(j) c(0.51, 0.59, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
      fig_incr <- c(mar*rel_line_height[1], mar*rel_line_height[1], 0, 0)
      fig_bar <- function(j) c(0.91, 0.99, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
    } else {
      kmeans <- km_T
      X <- X_T
      x <- x_T
      fig <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.41, 0.49)
      fig_incr <- c(0, 0, -mar*rel_line_height[2], -mar*rel_line_height[2])
      fig_bar <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.01, 0.09)
    }
    cluster_sizes <- kmeans$size
    y <- do.call(cbind, lapply(1:4, function(j) colMeans(X[kmeans$cluster==j, colnames(X) %in% days], na.rm=TRUE)))
    for (j in 1:4) {
      # bar_range <- 1.1 * range(kmeans$centers)
      par(fig=fig(j)-fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
      plot(days, y[,j], type='l', ylim=range(y), las=2, col=COL[[medium]], lwd=2, xlab='', ylab='', xaxt='n', yaxt='n',
           bty='n', panel.first=bg("#eeeeee"))
      text(max(days), max(y), sprintf('C%d', j), adj=c(1,1), cex=1)
      par(fig=fig_bar(j)+fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
      barplot(kmeans$centers[j,], main='', ylab='', col=feat_cols, names.arg='', ylim=bar_range, yaxt='n', border=NA,
              bty='n', panel.first=bg("#eeeeee"))
      if (medium=='TWITTER' && j==4) {
        mtext('z-scores', side=4, line=1.8, cex=0.5)
        axis(4, at=seq(-2,5), labels=c('-2','','0','','2','','4',''), cex.axis=0.5, cex.lab=0.5,
                                          lwd=0.8, lwd.ticks=0.8, las=1, hadj=0)
      }
      text(par("usr")[2]*0.95, par("usr")[4]*0.95, adj=c(1,1), cex=0.5,
           sprintf('N=%d\n(%.0f%%)', kmeans$size[j], 100*kmeans$size[j]/sum(kmeans$size)))
    }
  }
  ### Matrix row label.
  par(fig=c(0.50, 1, 0.50, 0.55), new=TRUE, mar=c(0,0,0,0))
  plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
  text(0.5, 0, 'Twitter', adj=c(0.5,0), col=COL[['TWITTER']], cex=1.2)
  ### Matrix column label.
  par(fig=c(0.40, 0.59, 0.42, 0.50), new=TRUE, mar=c(0,0,0,0))
  plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
  text(1, 0, 'News', adj=c(1,0), col=COL[['NEWS']], cex=1.2)
  
  ### Barplot legend.
  par(fig=c(0.45, 0.59, 0.01, 0.09), new=TRUE)
  plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
  legend("bottomleft", legend=c(' pre-mortem mean', ' short-term boost', ' long-term boost', ' halving time'),
         pch=15, pt.cex=3, cex=0.7, y.intersp=1.7,  bty='n', col=feat_cols, text.col=feat_cols)
  
  for (i in 3:1) {
    delta <- 0.007
    par(fig=c(i*delta, 0.4+i*delta, i*delta, 0.4+i*delta), new=TRUE, mar=c(5,5,0,0))
    plot(NULL, xaxt='n', yaxt='n', xlab='', ylab='', panel.first=bg('white'), xlim=0:1, ylim=0:1)
  }
  par(fig=c(0, 0.50, 0.43, 0.59), new=TRUE, mar=c(0,0,0,0))
  plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
  text(0.5, 0, sprintf('Mention time series (N=%d)', dim(X)[1]), adj=c(0.5,0), cex=0.8)
  
  par(fig=c(0, 0.40, 0, 0.40), new=TRUE, mar=c(5,5,0,0))
  name <- 'Whitney_Houston'
  # name <- 'Donna_Summer'
  # name <- 'Amy_Winehouse'
  medium <- 'NEWS'
  if (medium == 'NEWS') {
    X <- X_N
    x <- x_N
    num_art <- num_art_N
  } else {
    X <- X_T
    x <- x_T
    num_art <- num_art_T
  }
  cex <- 0.65 # 0.85
  mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
  raw <- log10(x[mid, colnames(x) %in% days])
  smoothed <- X[mid, colnames(X) %in% days]
  ymin <- min(raw[is.finite(raw)], na.rm=TRUE)
  col <- as.vector(col2rgb(COL[[medium]])/255)
  col <- rgb(col[1], col[2], col[3], 0.2)
  plot(days, raw, col=col, main='', pch=20, cex=2, cex.axis=0.7, cex.lab=0.7,
       bty='o', xlab='Days since death', ylab='Fraction mentioning documents [log10]', ylim=c(ymin,-1),
       panel.first=c(bg('white'), abline(v=0, col='gray', lty=2)))
  lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
  text(max(days), -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1) # cex=1.5)
  y_before <- stats_N[mid, 'mean_before']
  y_peak <- y_before + stats_N[mid, 'peak_mean_boost']
  y_perm <- y_before + stats_N[mid, 'perm_boost']
  x_tth <- stats_N[mid, 'time_till_half'] * max(as.numeric(colnames(X)))
  y_tth <- mean(c(y_peak, y_perm))
  # mean before
  segments(x0=min(days), y0=y_before, x1=-30, lwd=2, col=feat_cols['mean_before'])
  text(-60, y_before, 'pre-mortem\nmean', srt=0, adj=c(0.5,1.5), col=feat_cols['mean_before'], cex=cex)
  # peak
  segments(x0=0, y0=y_peak, x1=30)
  arrows(x0=-40, y0=y_before, y1=y_peak, length=0.05, code=3, lwd=2, col=feat_cols['peak_mean_boost'])
  text(-70, mean(c(y_before, y_peak)), 'short-\nterm\nboost', srt=0, adj=c(0.5,0.5), col=feat_cols['peak_mean_boost'], cex=cex)
  # perm
  segments(x0=30, y0=y_perm, x1=max(days))
  arrows(x0=100, y0=y_before, y1=y_perm, length=0.05, code=3, lwd=2, col=feat_cols['perm_boost'])
  text(110, mean(c(y_before, y_perm)), 'long-term boost', srt=0, adj=c(0,0.5), col=feat_cols['perm_boost'], cex=cex)
  # tth
  arrows(x0=1, y0=y_tth, x1=x_tth, length=0.05, code=3, lwd=2, col=feat_cols['time_till_half'])
  text(x_tth+10, y_tth, 'halving time', srt=0, adj=c(0,0.5), col=feat_cols['time_till_half'], cex=cex)
  
  ### Arrow from example to cluster.
  par(fig=c(0.40, 0.51, 0, 0.45), new=TRUE, mar=c(0,0,0,0))
  plot(NULL, xaxt='n', yaxt='n', bty='n', xlab='', ylab='', xlim=0:1, ylim=0:1)
  col <- 'gray50'
  arrows(x0=0, y0=0.5, x1=0.5, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
  arrows(x0=0.5, y0=0.5, y1=0.40, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
  arrows(x0=0.5, y0=0.40, x1=0.98, length=0.2, code=2, lwd=5, col=col, ljoin=1, lend=1)
  text(0.50, 0.53, 'Clustering', srt=0, adj=c(0,0.5), col=col, cex=1.5, font=2, srt=90)
  
  if (SAVE_PLOTS) dev.off()
}

num_art_N <- get_num_art('NEWS')
num_art_T <- get_num_art('TWITTER')

data_N <- get_mention_freq_table('NEWS')
data_T <- get_mention_freq_table('TWITTER')

x_N <- get_rel_date_matrix('NEWS', data_N, num_art_N, CHUNK_SIZE)
x_T <- get_rel_date_matrix('TWITTER', data_T, num_art_T, CHUNK_SIZE)

mids_N <- filter_people('NEWS', x_N)
mids_T <- filter_people('TWITTER', x_T)
mids <- intersect(mids_N, mids_T)

x_N <- x_N[mids,]
x_T <- x_T[mids,]

X_N <- normalize_and_smooth('NEWS', x_N, num_art_N, mean_center=FALSE)
X_T <- normalize_and_smooth('TWITTER', x_T, num_art_T, mean_center=FALSE)

stats_all <- load_stats_all()

stats_N <- load_stats('NEWS', x_N, X_N, num_art_N)
stats_T <- load_stats('TWITTER', x_T, X_T, num_art_T)

lmdata_N <- make_regression_data('NEWS')
lmdata_T <- make_regression_data('TWITTER')

BASE_AGE <- levels(lmdata_N$age_group)[1]

# stats_N and stats_T contain the same people, so we arbitrarily use stats_N here.
natural <- which(stats_N$natural_death == 1)
unnatural <- which(stats_N$unnatural_death == 1)
# Are there any people with ambiguous causes of death? -- No!
length(intersect(natural, unnatural))

cluster_feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
cluster_result_N <- cluster(stats_N, cluster_feats)
cluster_result_T <- cluster(stats_T, cluster_feats)
k <- 4
km_N <- cluster_result_N$clustering[[which(cluster_result_N$K==k)]]
km_T <- cluster_result_T$clustering[[which(cluster_result_T$K==k)]]

Number of documents per day

plot_num_art(num_art_N, 'NEWS')
plot_num_art(num_art_T, 'TWITTER')

Figure 1: Number of documents per day in the Spinn3r corpus.

Average daily number of documents by year

News:

tapply(num_art_N, substring(names(num_art_N),0,4), function(x) mean(x, na.rm=TRUE))
##     2008     2009     2010     2011     2012     2013     2014 
## 118874.4 269481.4 359729.0 222132.4 201000.3 169738.4 167931.8

Twitter:

tapply(num_art_T, substring(names(num_art_T),0,4), function(x) mean(x, na.rm=TRUE))
##         2008         2009         2010         2011         2012         2013 
##     1071.196  1097066.229  6704034.431 19725105.305 27667683.221 50809628.488 
##         2014 
## 36663198.418

Taxonomy of causes of death

Natural causes of death

Acute myeloid leukemia, Adrenocortical carcinoma, Alveolar rhabdomyosarcoma, Alzheimer’s disease, Amyloidosis, Amyotrophic lateral sclerosis, Anemia, Aneurysm, Aortic aneurysm, Aortic dissection, Appendix cancer, Asthma, Astrocytoma, Atherosclerosis, Atypical teratoid rhabdoid tumor, B-cell chronic lymphocytic leukemia, Bladder cancer, Bleeding, Blood disorder, Blunt trauma, Bone cancer, Bone tumor, Brain Cancer, Brain damage, Brain tumor, Breast cancer, Bronchitis, Bronchopneumonia, Cancer, Cardiac arrest, Cardiac dysrhythmia, Cardiac surgery, Cardiopulmonary Arrest, Cardiovascular disease, Cerebral hemorrhage, Cerebral infarction, Cervical cancer, Cholangiocarcinoma, Chronic kidney disease, Chronic Obstructive Pulmonary Disease, Cirrhosis, Colorectal cancer, Complication, Complications from a stroke, Complications from cardiac surgery, Complications from pneumonia, Complications of diabetes mellitus, Congenital heart defect, Coronary artery disease, Craniocerebral Trauma, Creutzfeldt–Jakob disease, Cystic fibrosis, Dementia, Dementia with Lewy bodies, Diabetes mellitus, Disease, Ebola virus disease, Emphysema, Epileptic seizure, Esophageal cancer, Gallbladder cancer, Glioblastoma multiforme, Heart Ailment, Heart failure, Heart valve disease, Heat Stroke, Hepatitis, HIV/AIDS, Hodgkin’s lymphoma, Huntington’s disease, Hypertension, Hypertensive heart disease, Hyperthermia, Illness, Infection, Influenza, Internal bleeding, Intracranial aneurysm, Intracranial hemorrhage, Kidney cancer, Laryngeal cancer, Leiomyosarcoma, Leukemia, Liver cancer, Liver disease, Liver failure, Liver tumour, Lung cancer, Lung disease, Lung Infection, Lymphoma, Malaria, Melanoma, Meningitis, Mesothelioma, Metastatic breast cancer, Metastatic Melanoma, Motor neuron disease, Multiple myeloma, Multiple organ dysfunction syndrome, Multiple organ failure, Multiple sclerosis, Multiple system atrophy, Myelodysplastic syndrome, Myocardial infarction, Natural causes, Nephropathy, Non-Hodgkin lymphoma, Old age, Oral cancer, Organ dysfunction, Ovarian cancer, Pancreatic cancer, Pancreatitis, Parkinson’s disease, Peritonitis, Pneumonia, Pneumothorax, Polycythemia, Polymyalgia rheumatica, Progressive supranuclear palsy, prolonged illness, Prostate cancer, Pulmonary edema, Pulmonary embolism, Pulmonary failure, Pulmonary fibrosis, Pyelonephritis, Renal failure, Respiratory arrest, Respiratory disease, Respiratory failure, Salivary gland neoplasm, Sepsis, Septic shock, Skin cancer, Smallpox, Squamous-cell carcinoma, Stomach cancer, Stroke, Subarachnoid hemorrhage, Subdural hematoma, Surgery, Surgical complications, T-Cell Lymphoma, Terminal illness, Throat Cancer, Thrombosis, Thrombus, Thyroid cancer, Urinary tract infection, Uterine cancer, Vascular dementia, Viral pneumonia

Unnatural causes of death

Accident, Accidental drug overdose, Accidental fall, Airstrike, Alcohol intoxication, Ambush, Asphyxia, Assassination, Assisted suicide, Aviation accident or incident, Ballistic trauma, Bike accident, Blast injury, Boating Accident, Bomb, Brain injury, Capital punishment, Car bomb, Carbon monoxide poisoning, Casualty of war, Cocaine overdose, Combined drug intoxication, Decapitation, Drowning, Drug overdose, Execution, Execution by firing squad, Execution-style murder, Explosion, Falling, Falling from height, Fire, Firearm, Gunshot, Hanging, Helicopter crash, Heroin overdose, Hit and run, Homicide, Improvised bombing, Injury, Killed in action, Lethal injection, Lightning, Major trauma, Motorcycle accident, Mountaineering, Murder, Murder_suicide, Murder–suicide, Poison, Poisoning, Racing Accident, Self-inflicted wound, Shark attack, Shootout, Skiing accident, Smoke inhalation, Stab wound, Stabbing, Strangling, Struck by car, Suicide, Suicide attack, Suicide by hanging, Tornado Incident, Torture, Traffic collision, Traumatic brain injury

Taxonomy of notability types

Biographic statistics of public figures

The first column represents the set of all 33340 Freebase people who died in the same time period studied (11 June 2009 to 30 September 2014). The second column represents the 2362 people studied. The third column represents the subset of the 870 people included in the regression analysis.

(A note on age: among all people in Freebase, there are some obvious age errors [e.g., 610 years]. Even when ignoring all ages above 120, the mean changes only slightly, dropping from 76.26 to 76.22.)

table <- make_bio_table()
table
##                        All Included Regression
## Age                                           
## Age N/A                10%       6%         0%
## 1st quartile            68       64         60
## Mean                    76       74         70
## Median                  80       77         73
## 3rd quartile            88       87         84
## Gender                                        
## Gender N/A             27%       7%         0%
## Female                 16%      17%        17%
## Male                   84%      83%        83%
## Manner of death                               
## Manner of death N/A    76%      60%         0%
## Natural                85%      86%        88%
## Unnatural              15%      14%        12%
## Language                                      
## Language N/A           45%      27%        14%
## Anglophone             60%      82%        80%
## Non-anglophone         40%      18%        20%
## Notability type                               
## Notability type N/A     1%       0%         0%
## art                    40%      50%        56%
## sports                 14%      14%        14%
## leadership             11%      14%        13%
## known for death        26%      16%        10%
## general fame            7%       4%         5%
## academia/engineering    2%       2%         2%
## Count                33340     2362        870

Table 1: Biographic statistics of public figures.

Examples of individual mention time series

(a) News

plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'NEWS', X_N, x_N, num_art_N, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)

(b) Twitter

plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'TWITTER', X_T, x_T, num_art_T, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)

Figure 2: Examples of mention time series for four deceased public figures.

Overlay of all mention time series

overlay_time_series(X_N, 'NEWS')
overlay_time_series(X_T, 'TWITTER')

Figure 3: Overlay of all mention time series.

Frequency of post-mortem peak days

Histogram of post-mortem peak days

mmh_N <- plot_max_mem_hist('NEWS')
mmh_T <- plot_max_mem_hist('TWITTER')

Figure 4: Histogram of post-mortem peak days. Left: news. Right: Twitter. For each post-mortem day we count for how many individuals the mention frequency is largest on that day, out of the 100 days immediately following death. For most people, the maximum mention frequency is observed on the day after death.

Relative frequencies of peak days

The first entry of each vector corresponds to day 0 (i.e., the day of death).

h_N <- mmh_N$hist
h_T <- mmh_T$hist
max_days_N <- mmh_N$max_days
max_days_T <- mmh_T$max_days

probs_N <- (h_N$counts/sum(h_N$counts))[1:30]
probs_T <- (h_T$counts/sum(h_T$counts))[1:30]

News:

probs_N
##  [1] 0.2167654530 0.2933954276 0.1109229467 0.0444538527 0.0309060119
##  [6] 0.0258255715 0.0207451312 0.0114309907 0.0135478408 0.0101608806
## [11] 0.0076206605 0.0063505504 0.0050804403 0.0042337003 0.0038103302
## [16] 0.0033869602 0.0063505504 0.0008467401 0.0029635902 0.0029635902
## [21] 0.0046570703 0.0033869602 0.0021168501 0.0033869602 0.0042337003
## [26] 0.0021168501 0.0033869602 0.0012701101 0.0029635902 0.0025402202

Twitter:

probs_T
##  [1] 0.2878916173 0.3302286198 0.0999153260 0.0482641829 0.0325994920
##  [6] 0.0169348010 0.0156646909 0.0084674005 0.0042337003 0.0080440305
## [11] 0.0076206605 0.0055038103 0.0029635902 0.0029635902 0.0059271804
## [16] 0.0025402202 0.0055038103 0.0025402202 0.0012701101 0.0033869602
## [21] 0.0025402202 0.0004233700 0.0016934801 0.0008467401 0.0012701101
## [26] 0.0025402202 0.0055038103 0.0021168501 0.0016934801 0.0021168501

Cumulative relative frequencies of peak days

The first entry of each vector corresponds to day 0 (i.e., the day of death).

News:

cumsum(probs_N)
##  [1] 0.2167655 0.5101609 0.6210838 0.6655377 0.6964437 0.7222693 0.7430144
##  [8] 0.7544454 0.7679932 0.7781541 0.7857748 0.7921253 0.7972058 0.8014395
## [15] 0.8052498 0.8086367 0.8149873 0.8158340 0.8187976 0.8217612 0.8264183
## [22] 0.8298052 0.8319221 0.8353091 0.8395428 0.8416596 0.8450466 0.8463167
## [29] 0.8492803 0.8518205

Twitter:

cumsum(probs_T)
##  [1] 0.2878916 0.6181202 0.7180356 0.7662997 0.7988992 0.8158340 0.8314987
##  [8] 0.8399661 0.8441998 0.8522439 0.8598645 0.8653683 0.8683319 0.8712955
## [15] 0.8772227 0.8797629 0.8852667 0.8878069 0.8890771 0.8924640 0.8950042
## [22] 0.8954276 0.8971211 0.8979678 0.8992379 0.9017782 0.9072820 0.9093988
## [29] 0.9110923 0.9132091

People with late peak days (after day 29)

Those whose peak day comes after day 29 tend to be those with low short-term boosts, as seen by computing the relative rank with respect to short-term boost of those with a late peak.

News:

max_rank <- length(max_days_N)-1
(summary(stats_N$peak_mean_boost_rank[max_days_N>29])-1) / max_rank
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2238  0.4439  0.4620  0.6837  0.9958
# Compare this to the fraction of people with a late peak day.
(sum(max_days_N>29)-1) / max_rank
## [1] 0.1478187

Twitter:

(summary(stats_T$peak_mean_boost_rank[max_days_T>29])-1) / max_rank
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002541 0.181279 0.430750 0.454508 0.713681 0.998306
# Compare this to the fraction of people with a late peak day.
(sum(max_days_T>29)-1) / max_rank
## [1] 0.08640407

Average mention time series

biexp_model_N <- biexp_fit('NEWS')
biexp_model_T <- biexp_fit('TWITTER')

(a) News

plot_avg_fraction_of_mentioning_docs('NEWS', biexp_model=biexp_model_N)

(b) Twitter

plot_avg_fraction_of_mentioning_docs('TWITTER', biexp_model=biexp_model_T)

Figure 5: Average mention time series, obtained via the arithmetic mean of the individual raw mention time series of the people included in the study, for (a) news and (b) Twitter.

Biexponential model parameters

News:

coef(biexp_model_N)
##            p            N            r            q 
## 2.365624e-01 3.462593e-05 1.489664e-02 5.124453e-04

Twitter:

coef(biexp_model_T)
##            p            N            r            q 
## 2.548633e-01 9.127908e-07 7.908213e-03 4.961750e-04

Proportion of cultural vs. communicative memory

plot_comm_vs_cult_mem('NEWS', biexp_coefs=coef(biexp_model_N))
plot_comm_vs_cult_mem('TWITTER', biexp_coefs=coef(biexp_model_T))

Figure 6: Proportion of cultural vs. communicative memory. On day \(t\) after death, the total collective memory according to the biexponential model fit is \(S(t)=u(t)+v(t)\), where \(u(t)\) is the communicative memory, and \(v(t)\) is the cultural memory. We plot the proportion \(v(t)/S(t)\) as a function of \(t\). Left: news. Right: Twitter.

Alternative models

(a) News

plot_all_model_fits('NEWS')

(b) Twitter

plot_all_model_fits('TWITTER')
## Warning in nls(log(y) ~ -log(a + b * t^c), start = list(a = a0, b = b0, :
## Convergence failure: function evaluation limit reached without convergence (9)

Figure 7: Comparison of eight models \(S(t)\) for fitting empirical mention frequencies, for (a) the news and (b) Twitter. All \(y\)-axes are logarithmic. The left and right plots in each row show the same fits, the only difference being that the left plots have linear \(x\)-axes, whereas the right plots have logarithmic \(x\)-axes. Fits are nonlinear least-squares fits obtained in log space, i.e., logarithms were taken of both the empirical data and the model \(S(t)\) before performing the least-squares optimization (see equation (5) in the paper for the case of the biexponential model). The biexponential function was introduced by Candia et al. (“The universal decay of collective memory and attention.” Nature Human Behaviour. 2019; 3(1):82-91) and is defined in equation (4) of the paper. Theoretical motivations for six of the seven remaining functions are given by Rubin and Wenzel (“One hundred years of forgetting: A quantitative description of retention.” Psychological Review. 1996; 103(4):734-760). Four of these functions are parameterized by two parameters \(a,b>0\), as follows:

The four above functions share the shortcoming of being concave (exponential, hyperbolic, logarithmic) or linear (power) when plotted on log-log axes (i.e., \(\log S(t)\) is a concave function of \(\log t\), cf. right columns), whereas the empirical curves are convex on log-log axes.

Rubin and Wenzel also proposed generalized versions of the exponential and hyperbolic functions, called exponential-power and hyperbolic-power functions, respectively, where \(t\) is replaced by the power \(t^c\), where \(c\) is a third parameter. (Analogous generalized versions of the logarithmic and power functions are not necessary, as they can already be expressed by the plain logarithmic and power functions, since \(b \log t^c = (bc) \log t\).) For \(c>0\), the exponential-power and hyperbolic-power functions, too, are concave in log-log space, but when allowing for \(b,c < 0\), they can be made convex and are thus better suited for fitting the empirical data (note that, in the following specifications, we maintain \(a,b,c > 0\), but replace \(b\) by \(-b\), and \(c\) by \(-c\)):

Finally, as the seventh function, we consider what Candia et al. refer to as the “log-normal” function, defined as \(S(t) = \exp(\log a - b \log t - c (\log t)^2)\). To recognize the fact that, although this function takes the functional form of the log-normal distribution, it is not actually used to describe a probability distribution here, we refer to the function as “log-normal-inspired”. The log-normal-inspired function, too, is concave in log-log space, but can be made convex by replacing \(c\) by \(-c\), i.e., \(S(t) = \exp(\log a - b \log t + c (\log t)^2)\). Note, however, that this results in \(S(t)\) being an increasing function of \(t\) as \(t \rightarrow \infty\), unlike the empirical data and unlike what one would require from a sound theoretical model of collective memory. We hence constrain the parameters such that the fitted function is monotonically decreasing over the modeled time range (days 1 to \(t_{\max}=400\)). Since the unconstrained function, when fitted to the empirical data, assumes a minimum at \(1<t<t_{\max}\), the monotonicity constraint is equivalent to requiring the minimum to occur at \(t=t_{\max}\), which happens for \(b=-2ct_{\max}\) and gives rise to the following model:

We quantify goodness of fit using two measures (results in the legends of the right plots): (1) via coefficients of determination (\(R^2\); computed as the squared correlation between observed and predicted values on the log scale; larger is better); (2) in order to account for the varying model complexity (the models have between two and four parameters), via Akaike’s information criterion (AIC; smaller is better). \(R^2\) and AIC result in the same ordering of the eight models, and the ordering is identical across the two media (news and Twitter). In the figure, the models are sorted, top-down, in increasing order of goodness of fit. The biexponential model provides the best fit according to both measures (\(R^2\) and AIC) and for both media (news and Twitter).

Mention time series characteristics

Summaries of distributions of mention time series characteristics

(a) News

summ <- summary(stats_N[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
colnames(summ) <- FANCY_CURVE_CHAR_NAMES[trimws(colnames(summ))]
summ
##  Pre-mortem mean  Short-term boost  Long-term boost      Halving time     
##  Min.   :-5.851   Min.   :-0.2055   Min.   :-1.5490200   Min.   :0.01667  
##  1st Qu.:-5.835   1st Qu.: 1.1924   1st Qu.:-0.0147554   1st Qu.:0.05903  
##  Median :-5.819   Median : 1.9754   Median : 0.0005455   Median :0.16111  
##  Mean   :-5.755   Mean   : 1.9171   Mean   : 0.0191381   Mean   :0.23172  
##  3rd Qu.:-5.770   3rd Qu.: 2.6687   3rd Qu.: 0.0270671   3rd Qu.:0.35833  
##  Max.   :-3.267   Max.   : 4.2103   Max.   : 1.3884812   Max.   :0.95833

(b) Twitter

summ <- summary(stats_T[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
colnames(summ) <- FANCY_CURVE_CHAR_NAMES[trimws(colnames(summ))]
summ
##  Pre-mortem mean  Short-term boost  Long-term boost     Halving time     
##  Min.   :-7.866   Min.   :-0.0413   Min.   :-0.557536   Min.   :0.01667  
##  1st Qu.:-7.839   1st Qu.: 1.5856   1st Qu.:-0.006985   1st Qu.:0.06667  
##  Median :-7.803   Median : 2.4474   Median : 0.016047   Median :0.15972  
##  Mean   :-7.664   Mean   : 2.3262   Mean   : 0.055143   Mean   :0.22363  
##  3rd Qu.:-7.671   3rd Qu.: 3.0998   3rd Qu.: 0.077694   3rd Qu.:0.33611  
##  Max.   :-4.576   Max.   : 4.9904   Max.   : 1.552516   Max.   :0.88056

Table 2: Summaries of distribution of mention time series characteristics for (a) the news and (b) Twitter.

\[\\[4mm]\]

plot_smoothed_densities_raw(stats_N, 'NEWS', 'mid', list(unique(stats_N$mid)), main='NEWS')

plot_smoothed_densities_raw(stats_T, 'TWITTER', 'mid', list(unique(stats_T$mid)), main='TWITTER')

Figure 8: Kernel-smoothed density plots of mention time series characteristics for the news (top) and Twitter (bottom).

Magnitude of short- and long-term boosts

Short-term boost

News:

boot_ci(stats_N$peak_mean_boost, median)
##     upper point_est     lower 
##  2.031260  1.975446  1.896458
wilcox.test(stats_N$peak_mean_boost, mu=0, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_N$peak_mean_boost
## V = 2790570, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  1.886672 1.967571
## sample estimates:
## (pseudo)median 
##       1.927002

Twitter:

boot_ci(stats_T$peak_mean_boost, median)
##     upper point_est     lower 
##  2.500643  2.447381  2.372522
wilcox.test(stats_T$peak_mean_boost, mu=0, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_T$peak_mean_boost
## V = 2790610, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  2.309257 2.402026
## sample estimates:
## (pseudo)median 
##       2.355734

Comparison of short-term boosts in news vs. Twitter

wilcox.test(x=stats_N$peak_mean_boost, y=stats_T$peak_mean_boost, paired=TRUE, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_N$peak_mean_boost and stats_T$peak_mean_boost
## V = 477893, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.4415986 -0.3917311
## sample estimates:
## (pseudo)median 
##      -0.416588

Long-term boost

News:

boot_ci(stats_N$perm_boost, median)
##         upper     point_est         lower 
##  0.0017260233  0.0005454994 -0.0009987135
wilcox.test(stats_N$perm_boost, mu=0, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_N$perm_boost
## V = 1560596, p-value = 6.199e-07
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  0.002285753 0.005541149
## sample estimates:
## (pseudo)median 
##    0.003893564

Twitter:

boot_ci(stats_T$perm_boost, median)
##      upper  point_est      lower 
## 0.01751758 0.01604730 0.01334752
wilcox.test(stats_T$perm_boost, mu=0, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_T$perm_boost
## V = 2053579, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  0.02424449 0.03112195
## sample estimates:
## (pseudo)median 
##     0.02754178

Comparison of long-term boosts in news vs. Twitter

wilcox.test(x=stats_N$perm_boost, y=stats_T$perm_boost, paired=TRUE, conf.int=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  stats_N$perm_boost and stats_T$perm_boost
## V = 881590, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.02542398 -0.01933168
## sample estimates:
## (pseudo)median 
##    -0.02231052

Mention time series characteristics by notability type

(a) News

plot_chars_by_group(stats_N, 'NEWS', stats_N$type_group, type_groups)

(b) Twitter

plot_chars_by_group(stats_T, 'TWITTER', stats_T$type_group, type_groups)

Figure 9: Distributions of mention time series characteristics by notability type, visualized as box plots, for (a) the news and (b) Twitter. Boxes are bounded by the first and third quartiles; whiskers extend 1.5 inter-quartile ranges beyond the first and third quartiles (or to the minimum/maximum in case they fall within 1.5 inter-quartile ranges); the center bars mark medians, with notches corresponding to 95% confidence intervals of the median.

Cluster analysis

Optimal number of clusters

plot_sil_width('NEWS', cluster_result_N$clustering, cluster_result_N$diss)
plot_sil_width('TWITTER', cluster_result_T$clustering, cluster_result_T$diss)

Figure 10: Average silhouette width of clusterings produced by \(k\)-means algorithm, as a function of the number \(k\) of clusters (higher is better), for \(k \in \{2, \dots, 30\}\). Both (a) in the news and (b) on Twitter, \(k=4\) is optimal.

Overlay of time series per cluster

plot_clustered_time_series('NEWS', km_N, X_N)

plot_clustered_time_series('TWITTER', km_T, X_T)

Figure 11: Overlay of time series in each cluster, for news (top) and Twitter (bottom).

Analysis of cluster confusion matrix

News and Twitter are not independent

X <- table(as.factor(km_N$cluster), as.factor(km_T$cluster))
chisq.test(X, simulate.p.value=TRUE, B=1e5)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 1e+05
##  replicates)
## 
## data:  X
## X-squared = 1739.3, df = NA, p-value = 1e-05

Log ratios of empirical vs. null probabilities

Null probabilites represent the case where news and Twitter are assumed to be independent (with the constraint that cluster sizes must remain constant).

The following matrix shows the log10 ratio of the empirical cluster size and the cluster size under the null model (rows are news cluster, columns are Twitter clusters). That is, positive values imply overrepresentation in the empirical data, and negative values, underrepresentation. We see that all diagonal entries are overrepresented, whereas all off-diagonal entries except two—\((3,4)\) and \((4,3)\)—are underrepresented.

X.indep <- (rowSums(X) %o% colSums(X)) / sum(X)
log10(X / X.indep)
##    
##               1           2           3           4
##   1  0.12144877 -0.41341084 -0.03466453 -0.37941148
##   2 -0.31927109  0.43136894 -0.77466424 -0.76366886
##   3 -0.40569573 -1.34225550  0.68375399  0.72277810
##   4 -0.24239967 -0.89911874  0.01774628  1.06900484

Overrepresentation of diagonal entries

To quantify the overrepresentation of the diagonal, we compute four diagonal sums (everything under the constraint that the row and column sums are fixed to the empirical values):

  1. the minimum possible,
  2. the independent null model,
  3. the empirical data,
  4. the maximum possible.

The minimum and maximum are computed via linear programming.

k <- nrow(X)
objective.in <- as.vector(diag(nrow=k))
const.mat <- rbind(t(diag(1,k) %x% rep(1,k)),
                   t(rep(1,k) %x% diag(1,k)))
const.rhs <- c(colSums(X), rowSums(X))
const.dir <- rep('==', k)
opt.max <- lp('max', objective.in, const.mat, const.dir, const.rhs)
X.max <- matrix(opt.max$solution, k, k)
opt.min <- lp('min', objective.in, const.mat, const.dir, const.rhs)
X.min <- matrix(opt.min$solution, k, k)

The values of the four above-described diagonal sums are as follows:

sums <- c(sum(diag(X.min)), sum(diag(X.indep)), sum(diag(X)), sum(diag(X.max)))
sums
## [1]  505.000 1059.347 1727.000 2236.000

Normalizing by the min-to-max span, we see that the empirical diagonal gets 71% of the way between min and max, whereas the null only gets 32%, for a factor of 2.2.

(sums - sum(diag(X.min))) / (sum(diag(X.max)) - sum(diag(X.min)))
## [1] 0.0000000 0.3202468 0.7059503 1.0000000

Overrepresentation of cluster combinations (3, 4) and (4, 3)

On the contrary, nearly all off-diagonal entries are underrepresented, with the exception of \((4,3)\), which is not significantly different from the null:

prop.test(X[4,3], sum(X), X.indep[4,3]/sum(X))
## 
##  1-sample proportions test with continuity correction
## 
## data:  X[4, 3] out of sum(X), null probability X.indep[4, 3]/sum(X)
## X-squared = 5.0682e-29, df = 1, p-value = 1
## alternative hypothesis: true p is not equal to 0.002844932
## 95 percent confidence interval:
##  0.001359169 0.006264032
## sample estimates:
##          p 
## 0.00296359

and \((3,4)\), which is massively overrepresented, compared to the null:

prop.test(X[3,4], sum(X), X.indep[3,4]/sum(X))
## 
##  1-sample proportions test with continuity correction
## 
## data:  X[3, 4] out of sum(X), null probability X.indep[3, 4]/sum(X)
## X-squared = 135.03, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.003206284
## 95 percent confidence interval:
##  0.01228142 0.02321996
## sample estimates:
##         p 
## 0.0169348

Regression modeling

Note: the variable names in the code may differ from those in the paper; see mapping in lists below.

Independent variables:

Dependendent variables:

Models without interactions

for (ranks in c(FALSE, TRUE)) {
  mods_N <- run_regression_analysis('NEWS', ranks)
  mods_T <- run_regression_analysis('TWITTER', ranks)
  
  print_regression_table(mods_N, mods_T, ranks)
  
  print_model_description('NEWS', 'short-term boost', ranks)
  print(summary(mods_N$mod_peak))
  
  print_model_description('NEWS', 'long-term boost', ranks)
  print(summary(mods_N$mod_perm))
  
  print_model_description('TWITTER', 'short-term boost', ranks)
  print(summary(mods_T$mod_peak))
  
  print_model_description('TWITTER', 'long-term boost', ranks)
  print(summary(mods_T$mod_perm))
}
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3921 -0.4654  0.1215  0.5730  2.0660 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.32157    0.06293  36.890  < 2e-16 ***
## mean_before_relrank             0.80417    0.09297   8.650  < 2e-16 ***
## age_group20                     0.16165    0.17015   0.950   0.3423    
## age_group30                     0.40001    0.16740   2.390   0.0171 *  
## age_group40                    -0.04607    0.12599  -0.366   0.7147    
## age_group50                    -0.07459    0.09893  -0.754   0.4511    
## age_group60                    -0.10927    0.08172  -1.337   0.1816    
## age_group80                     0.02162    0.07826   0.276   0.7824    
## age_group90                     0.17414    0.09779   1.781   0.0753 .  
## death_typeunnatural             0.61845    0.09460   6.537 1.08e-10 ***
## genderFemale                    0.08287    0.07213   1.149   0.2509    
## anglonon_anglo                 -0.31567    0.07378  -4.279 2.09e-05 ***
## anglounknown                   -0.44551    0.08550  -5.211 2.36e-07 ***
## type_groupacademia/engineering  0.18120    0.19745   0.918   0.3590    
## type_groupgeneral fame          0.06993    0.12391   0.564   0.5727    
## type_groupknown for death      -0.10659    0.09933  -1.073   0.2835    
## type_groupleadership            0.15217    0.08285   1.837   0.0666 .  
## type_groupsports                0.04886    0.08336   0.586   0.5580    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7725 on 852 degrees of freedom
## Multiple R-squared:  0.2131, Adjusted R-squared:  0.1974 
## F-statistic: 13.57 on 17 and 852 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92517 -0.07187 -0.02152  0.03771  1.19435 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.0879028  0.0137837   6.377 2.95e-10 ***
## mean_before_relrank             0.0305324  0.0203625   1.499  0.13413    
## age_group20                     0.0603247  0.0372663   1.619  0.10587    
## age_group30                     0.0277988  0.0366633   0.758  0.44853    
## age_group40                    -0.0009074  0.0275939  -0.033  0.97377    
## age_group50                    -0.0578543  0.0216672  -2.670  0.00773 ** 
## age_group60                    -0.0499194  0.0178988  -2.789  0.00541 ** 
## age_group80                    -0.0181927  0.0171416  -1.061  0.28884    
## age_group90                    -0.0111474  0.0214176  -0.520  0.60287    
## death_typeunnatural             0.0973507  0.0207195   4.699 3.05e-06 ***
## genderFemale                    0.0343766  0.0157970   2.176  0.02982 *  
## anglonon_anglo                 -0.0610353  0.0161588  -3.777  0.00017 ***
## anglounknown                   -0.0790566  0.0187267  -4.222 2.69e-05 ***
## type_groupacademia/engineering -0.0315605  0.0432446  -0.730  0.46570    
## type_groupgeneral fame         -0.0102523  0.0271399  -0.378  0.70571    
## type_groupknown for death      -0.0213581  0.0217556  -0.982  0.32651    
## type_groupleadership           -0.0581051  0.0181453  -3.202  0.00141 ** 
## type_groupsports               -0.0341565  0.0182585  -1.871  0.06173 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1692 on 852 degrees of freedom
## Multiple R-squared:  0.1234, Adjusted R-squared:  0.1059 
## F-statistic: 7.056 on 17 and 852 DF,  p-value: 3.001e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.97464 -0.40997  0.07498  0.57097  2.10379 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.67046    0.06666  40.061  < 2e-16 ***
## mean_before_relrank             0.94776    0.09955   9.521  < 2e-16 ***
## age_group20                     0.71833    0.17951   4.002 6.84e-05 ***
## age_group30                     0.64917    0.17658   3.676 0.000251 ***
## age_group40                     0.35056    0.13278   2.640 0.008438 ** 
## age_group50                     0.18143    0.10435   1.739 0.082470 .  
## age_group60                     0.13015    0.08611   1.511 0.131051    
## age_group80                     0.02061    0.08247   0.250 0.802727    
## age_group90                     0.03437    0.10319   0.333 0.739172    
## death_typeunnatural             0.28194    0.09978   2.826 0.004829 ** 
## genderFemale                   -0.03354    0.07591  -0.442 0.658720    
## anglonon_anglo                 -0.11627    0.07763  -1.498 0.134593    
## anglounknown                   -0.32516    0.09079  -3.582 0.000361 ***
## type_groupacademia/engineering  0.33965    0.20822   1.631 0.103220    
## type_groupgeneral fame          0.13243    0.13075   1.013 0.311418    
## type_groupknown for death      -0.08814    0.10562  -0.835 0.404208    
## type_groupleadership            0.11290    0.08677   1.301 0.193592    
## type_groupsports                0.07169    0.08830   0.812 0.417031    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8148 on 852 degrees of freedom
## Multiple R-squared:  0.1917, Adjusted R-squared:  0.1755 
## F-statistic: 11.88 on 17 and 852 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55367 -0.08110 -0.01437  0.05324  1.17998 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.095062   0.014816   6.416 2.32e-10 ***
## mean_before_relrank             0.086441   0.022127   3.907 0.000101 ***
## age_group20                     0.192254   0.039899   4.818 1.71e-06 ***
## age_group30                     0.118044   0.039248   3.008 0.002711 ** 
## age_group40                     0.099524   0.029512   3.372 0.000779 ***
## age_group50                    -0.024487   0.023195  -1.056 0.291393    
## age_group60                    -0.024520   0.019140  -1.281 0.200516    
## age_group80                    -0.012644   0.018330  -0.690 0.490506    
## age_group90                    -0.023678   0.022936  -1.032 0.302203    
## death_typeunnatural             0.106270   0.022178   4.792 1.95e-06 ***
## genderFemale                    0.005622   0.016873   0.333 0.739083    
## anglonon_anglo                 -0.037016   0.017256  -2.145 0.032225 *  
## anglounknown                   -0.080837   0.020179  -4.006 6.72e-05 ***
## type_groupacademia/engineering  0.023237   0.046282   0.502 0.615747    
## type_groupgeneral fame         -0.007989   0.029062  -0.275 0.783457    
## type_groupknown for death       0.008185   0.023476   0.349 0.727436    
## type_groupleadership           -0.039897   0.019287  -2.069 0.038885 *  
## type_groupsports               -0.034255   0.019626  -1.745 0.081269 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1811 on 852 degrees of freedom
## Multiple R-squared:  0.1776, Adjusted R-squared:  0.1612 
## F-statistic: 10.82 on 17 and 852 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62044 -0.19422  0.01128  0.20580  0.74237 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.001595   0.020820   0.077   0.9390    
## mean_before_relrank             0.292185   0.030757   9.500  < 2e-16 ***
## age_group20                     0.054393   0.056290   0.966   0.3342    
## age_group30                     0.119792   0.055379   2.163   0.0308 *  
## age_group40                    -0.020965   0.041680  -0.503   0.6151    
## age_group50                    -0.029145   0.032728  -0.891   0.3734    
## age_group60                    -0.046225   0.027036  -1.710   0.0877 .  
## age_group80                     0.011330   0.025892   0.438   0.6618    
## age_group90                     0.070412   0.032351   2.177   0.0298 *  
## death_typeunnatural             0.223393   0.031296   7.138 2.03e-12 ***
## genderFemale                    0.032258   0.023861   1.352   0.1768    
## anglonon_anglo                 -0.100770   0.024407  -4.129 4.01e-05 ***
## anglounknown                   -0.149597   0.028286  -5.289 1.57e-07 ***
## type_groupacademia/engineering  0.054180   0.065320   0.829   0.4071    
## type_groupgeneral fame          0.025013   0.040994   0.610   0.5419    
## type_groupknown for death      -0.041843   0.032861  -1.273   0.2033    
## type_groupleadership            0.027634   0.027408   1.008   0.3136    
## type_groupsports                0.001862   0.027579   0.068   0.9462    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2555 on 852 degrees of freedom
## Multiple R-squared:  0.2343, Adjusted R-squared:  0.219 
## F-statistic: 15.34 on 17 and 852 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60548 -0.21923  0.00579  0.23231  0.57350 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.069386   0.022386   3.099 0.002002 ** 
## mean_before_relrank            -0.035601   0.033071  -1.076 0.282009    
## age_group20                     0.014011   0.060525   0.231 0.816994    
## age_group30                    -0.025187   0.059546  -0.423 0.672407    
## age_group40                    -0.043987   0.044816  -0.982 0.326618    
## age_group50                    -0.116348   0.035190  -3.306 0.000985 ***
## age_group60                    -0.085145   0.029070  -2.929 0.003491 ** 
## age_group80                    -0.009021   0.027840  -0.324 0.745990    
## age_group90                     0.039381   0.034785   1.132 0.257902    
## death_typeunnatural             0.134966   0.033651   4.011 6.58e-05 ***
## genderFemale                    0.037946   0.025656   1.479 0.139504    
## anglonon_anglo                 -0.108631   0.026244  -4.139 3.83e-05 ***
## anglounknown                   -0.145758   0.030415  -4.792 1.94e-06 ***
## type_groupacademia/engineering -0.019779   0.070235  -0.282 0.778306    
## type_groupgeneral fame         -0.002759   0.044079  -0.063 0.950104    
## type_groupknown for death      -0.064889   0.035334  -1.836 0.066638 .  
## type_groupleadership           -0.071294   0.029470  -2.419 0.015763 *  
## type_groupsports               -0.042940   0.029654  -1.448 0.147977    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2748 on 852 degrees of freedom
## Multiple R-squared:  0.1148, Adjusted R-squared:  0.0971 
## F-statistic: 6.498 on 17 and 852 DF,  p-value: 1.155e-14
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71673 -0.18473 -0.00208  0.20710  0.61854 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.033516   0.021153  -1.584 0.113459    
## mean_before_relrank             0.336775   0.031589  10.661  < 2e-16 ***
## age_group20                     0.224402   0.056963   3.939 8.84e-05 ***
## age_group30                     0.216619   0.056034   3.866 0.000119 ***
## age_group40                     0.099944   0.042134   2.372 0.017910 *  
## age_group50                     0.048430   0.033115   1.462 0.143973    
## age_group60                     0.038284   0.027326   1.401 0.161574    
## age_group80                     0.003695   0.026169   0.141 0.887740    
## age_group90                     0.006365   0.032745   0.194 0.845930    
## death_typeunnatural             0.095068   0.031662   3.003 0.002756 ** 
## genderFemale                   -0.007467   0.024089  -0.310 0.756643    
## anglonon_anglo                 -0.030620   0.024636  -1.243 0.214240    
## anglounknown                   -0.107934   0.028809  -3.746 0.000191 ***
## type_groupacademia/engineering  0.120612   0.066075   1.825 0.068295 .  
## type_groupgeneral fame          0.052868   0.041492   1.274 0.202944    
## type_groupknown for death      -0.026970   0.033516  -0.805 0.421209    
## type_groupleadership            0.025037   0.027535   0.909 0.363470    
## type_groupsports                0.026369   0.028019   0.941 0.346917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2586 on 852 degrees of freedom
## Multiple R-squared:  0.2161, Adjusted R-squared:  0.2004 
## F-statistic: 13.82 on 17 and 852 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67278 -0.19342  0.02505  0.21969  0.60132 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.038607   0.022426   1.722 0.085511 .  
## mean_before_relrank             0.118846   0.033490   3.549 0.000408 ***
## age_group20                     0.111425   0.060390   1.845 0.065372 .  
## age_group30                     0.106071   0.059405   1.786 0.074526 .  
## age_group40                     0.060312   0.044669   1.350 0.177307    
## age_group50                    -0.060151   0.035107  -1.713 0.087008 .  
## age_group60                    -0.044402   0.028970  -1.533 0.125726    
## age_group80                    -0.015164   0.027744  -0.547 0.584804    
## age_group90                    -0.033081   0.034715  -0.953 0.340897    
## death_typeunnatural             0.096865   0.033567   2.886 0.004004 ** 
## genderFemale                    0.029224   0.025538   1.144 0.252806    
## anglonon_anglo                 -0.031642   0.026118  -1.212 0.226027    
## anglounknown                   -0.139591   0.030542  -4.570 5.59e-06 ***
## type_groupacademia/engineering  0.076049   0.070050   1.086 0.277952    
## type_groupgeneral fame          0.006527   0.043988   0.148 0.882072    
## type_groupknown for death      -0.025012   0.035532  -0.704 0.481664    
## type_groupleadership           -0.102206   0.029192  -3.501 0.000487 ***
## type_groupsports               -0.025534   0.029704  -0.860 0.390257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2741 on 852 degrees of freedom
## Multiple R-squared:  0.1189, Adjusted R-squared:  0.1013 
## F-statistic: 6.765 on 17 and 852 DF,  p-value: 2.014e-15

Models with “age by manner of death” interaction

## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3913 -0.4656  0.1301  0.5624  2.0282 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.79350    0.09382   8.458  < 2e-16 ***
## age_group70                      2.31291    0.06392  36.184  < 2e-16 ***
## age_group20                      2.70452    0.26667  10.142  < 2e-16 ***
## age_group30                      2.62259    0.23921  10.964  < 2e-16 ***
## age_group40                      2.22989    0.13600  16.397  < 2e-16 ***
## age_group50                      2.28254    0.09768  23.368  < 2e-16 ***
## age_group60                      2.21414    0.06950  31.859  < 2e-16 ***
## age_group80                      2.34947    0.06286  37.377  < 2e-16 ***
## age_group90                      2.50226    0.08461  29.574  < 2e-16 ***
## death_typeunnatural              0.93213    0.30056   3.101  0.00199 ** 
## genderFemale                     0.08619    0.07254   1.188  0.23513    
## anglonon_anglo                  -0.32918    0.07476  -4.403 1.20e-05 ***
## anglounknown                    -0.45736    0.08604  -5.316 1.36e-07 ***
## type_groupacademia/engineering   0.17904    0.19785   0.905  0.36577    
## type_groupgeneral fame           0.05273    0.12475   0.423  0.67264    
## type_groupknown for death       -0.09979    0.09996  -0.998  0.31843    
## type_groupleadership             0.15945    0.08318   1.917  0.05559 .  
## type_groupsports                 0.04115    0.08387   0.491  0.62376    
## age_group20:death_typeunnatural -0.63094    0.43567  -1.448  0.14793    
## age_group30:death_typeunnatural -0.14156    0.42659  -0.332  0.74009    
## age_group40:death_typeunnatural -0.16313    0.37877  -0.431  0.66682    
## age_group50:death_typeunnatural -0.44845    0.35775  -1.254  0.21037    
## age_group60:death_typeunnatural -0.30088    0.35364  -0.851  0.39511    
## age_group80:death_typeunnatural -0.47493    0.54408  -0.873  0.38296    
## age_group90:death_typeunnatural -0.81402    0.83740  -0.972  0.33129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7739 on 845 degrees of freedom
## Multiple R-squared:  0.9055, Adjusted R-squared:  0.9027 
## F-statistic: 323.8 on 25 and 845 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92547 -0.07094 -0.02103  0.03979  1.15319 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.025414   0.020412   1.245 0.213448    
## age_group70                      0.089665   0.013907   6.448 1.91e-10 ***
## age_group20                      0.060730   0.058016   1.047 0.295500    
## age_group30                      0.055130   0.052042   1.059 0.289748    
## age_group40                      0.067078   0.029587   2.267 0.023633 *  
## age_group50                      0.053144   0.021251   2.501 0.012580 *  
## age_group60                      0.037372   0.015120   2.472 0.013643 *  
## age_group80                      0.070787   0.013675   5.176 2.83e-07 ***
## age_group90                      0.078577   0.018407   4.269 2.19e-05 ***
## death_typeunnatural              0.054179   0.065388   0.829 0.407578    
## genderFemale                     0.035626   0.015782   2.257 0.024240 *  
## anglonon_anglo                  -0.061710   0.016264  -3.794 0.000159 ***
## anglounknown                    -0.080713   0.018718  -4.312 1.81e-05 ***
## type_groupacademia/engineering  -0.032227   0.043044  -0.749 0.454248    
## type_groupgeneral fame          -0.016796   0.027139  -0.619 0.536159    
## type_groupknown for death       -0.024095   0.021748  -1.108 0.268199    
## type_groupleadership            -0.057120   0.018097  -3.156 0.001654 ** 
## type_groupsports                -0.032155   0.018246  -1.762 0.078382 .  
## age_group20:death_typeunnatural  0.171955   0.094784   1.814 0.070003 .  
## age_group30:death_typeunnatural  0.144053   0.092807   1.552 0.120995    
## age_group40:death_typeunnatural  0.103868   0.082405   1.260 0.207855    
## age_group50:death_typeunnatural -0.060255   0.077832  -0.774 0.439047    
## age_group60:death_typeunnatural  0.053030   0.076937   0.689 0.490847    
## age_group80:death_typeunnatural  0.009704   0.118368   0.082 0.934680    
## age_group90:death_typeunnatural -0.144978   0.182183  -0.796 0.426381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1684 on 845 degrees of freedom
## Multiple R-squared:  0.1994, Adjusted R-squared:  0.1757 
## F-statistic: 8.418 on 25 and 845 DF,  p-value: < 2.2e-16

## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.94216 -0.41102  0.07389  0.57059  2.18694 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.94026    0.10036   9.369  < 2e-16 ***
## age_group70                      2.66931    0.06775  39.397  < 2e-16 ***
## age_group20                      3.47492    0.28167  12.337  < 2e-16 ***
## age_group30                      3.29553    0.25261  13.046  < 2e-16 ***
## age_group40                      2.93793    0.14373  20.441  < 2e-16 ***
## age_group50                      2.87237    0.10330  27.806  < 2e-16 ***
## age_group60                      2.80662    0.07339  38.240  < 2e-16 ***
## age_group80                      2.70010    0.06692  40.350  < 2e-16 ***
## age_group90                      2.70831    0.08941  30.292  < 2e-16 ***
## death_typeunnatural              0.37666    0.31740   1.187 0.235683    
## genderFemale                    -0.03440    0.07644  -0.450 0.652773    
## anglonon_anglo                  -0.12447    0.07878  -1.580 0.114517    
## anglounknown                    -0.33028    0.09150  -3.610 0.000325 ***
## type_groupacademia/engineering   0.33479    0.20886   1.603 0.109313    
## type_groupgeneral fame           0.11715    0.13179   0.889 0.374306    
## type_groupknown for death       -0.08548    0.10636  -0.804 0.421793    
## type_groupleadership             0.11857    0.08718   1.360 0.174159    
## type_groupsports                 0.06552    0.08892   0.737 0.461390    
## age_group20:death_typeunnatural -0.21410    0.46007  -0.465 0.641789    
## age_group30:death_typeunnatural -0.04720    0.45020  -0.105 0.916533    
## age_group40:death_typeunnatural  0.16610    0.39898   0.416 0.677281    
## age_group50:death_typeunnatural -0.16983    0.37827  -0.449 0.653564    
## age_group60:death_typeunnatural -0.12923    0.37346  -0.346 0.729391    
## age_group80:death_typeunnatural -0.49657    0.57411  -0.865 0.387321    
## age_group90:death_typeunnatural -0.24853    0.88473  -0.281 0.778846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8171 on 845 degrees of freedom
## Multiple R-squared:  0.9241, Adjusted R-squared:  0.9218 
## F-statistic: 411.4 on 25 and 845 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64801 -0.07463 -0.01514  0.04831  1.08654 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.083630   0.022064   3.790 0.000161 ***
## age_group70                      0.094879   0.014896   6.369 3.11e-10 ***
## age_group20                      0.079564   0.061926   1.285 0.199202    
## age_group30                      0.207745   0.055537   3.741 0.000196 ***
## age_group40                      0.196045   0.031600   6.204 8.61e-10 ***
## age_group50                      0.086549   0.022711   3.811 0.000148 ***
## age_group60                      0.071046   0.016136   4.403 1.21e-05 ***
## age_group80                      0.082052   0.014712   5.577 3.29e-08 ***
## age_group90                      0.072244   0.019656   3.675 0.000253 ***
## death_typeunnatural              0.090447   0.069782   1.296 0.195279    
## genderFemale                     0.006957   0.016805   0.414 0.678970    
## anglonon_anglo                  -0.036397   0.017321  -2.101 0.035909 *  
## anglounknown                    -0.079850   0.020116  -3.969 7.82e-05 ***
## type_groupacademia/engineering   0.022898   0.045918   0.499 0.618139    
## type_groupgeneral fame          -0.011641   0.028974  -0.402 0.687945    
## type_groupknown for death        0.006300   0.023383   0.269 0.787670    
## type_groupleadership            -0.040311   0.019166  -2.103 0.035737 *  
## type_groupsports                -0.028105   0.019549  -1.438 0.150895    
## age_group20:death_typeunnatural  0.318038   0.101148   3.144 0.001723 ** 
## age_group30:death_typeunnatural  0.021538   0.098979   0.218 0.827789    
## age_group40:death_typeunnatural  0.007598   0.087717   0.087 0.930992    
## age_group50:death_typeunnatural -0.063518   0.083164  -0.764 0.445219    
## age_group60:death_typeunnatural  0.003213   0.082106   0.039 0.968796    
## age_group80:death_typeunnatural  0.009910   0.126221   0.079 0.937438    
## age_group90:death_typeunnatural -0.113872   0.194512  -0.585 0.558420    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1796 on 845 degrees of freedom
## Multiple R-squared:  0.3217, Adjusted R-squared:  0.3016 
## F-statistic: 16.03 on 25 and 845 DF,  p-value: < 2.2e-16

## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62147 -0.19550  0.01088  0.20087  0.73057 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.2876073  0.0310288   9.269  < 2e-16 ***
## age_group70                     -0.0010522  0.0211400  -0.050  0.96031    
## age_group20                      0.1238746  0.0881930   1.405  0.16051    
## age_group30                      0.0812147  0.0791111   1.027  0.30491    
## age_group40                     -0.0379687  0.0449768  -0.844  0.39881    
## age_group50                     -0.0170764  0.0323044  -0.529  0.59722    
## age_group60                     -0.0432214  0.0229844  -1.880  0.06039 .  
## age_group80                      0.0149510  0.0207886   0.719  0.47222    
## age_group90                      0.0749740  0.0279819   2.679  0.00752 ** 
## death_typeunnatural              0.3175289  0.0993996   3.194  0.00145 ** 
## genderFemale                     0.0340646  0.0239914   1.420  0.15602    
## anglonon_anglo                  -0.1047908  0.0247235  -4.239  2.5e-05 ***
## anglounknown                    -0.1532241  0.0284546  -5.385  9.4e-08 ***
## type_groupacademia/engineering   0.0530173  0.0654330   0.810  0.41802    
## type_groupgeneral fame           0.0191595  0.0412557   0.464  0.64247    
## type_groupknown for death       -0.0403775  0.0330598  -1.221  0.22229    
## type_groupleadership             0.0299099  0.0275100   1.087  0.27724    
## type_groupsports                -0.0005107  0.0277366  -0.018  0.98531    
## age_group20:death_typeunnatural -0.1915827  0.1440851  -1.330  0.18399    
## age_group30:death_typeunnatural -0.0252587  0.1410809  -0.179  0.85795    
## age_group40:death_typeunnatural -0.0343547  0.1252681  -0.274  0.78396    
## age_group50:death_typeunnatural -0.1337441  0.1183163  -1.130  0.25863    
## age_group60:death_typeunnatural -0.0985255  0.1169552  -0.842  0.39979    
## age_group80:death_typeunnatural -0.1499235  0.1799365  -0.833  0.40497    
## age_group90:death_typeunnatural -0.3572365  0.2769449  -1.290  0.19743    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2559 on 845 degrees of freedom
## Multiple R-squared:  0.2383, Adjusted R-squared:  0.2157 
## F-statistic: 10.57 on 25 and 845 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63192 -0.21951  0.00156  0.23031  0.57993 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank             -0.039227   0.033372  -1.175 0.240153    
## age_group70                      0.070331   0.022736   3.093 0.002045 ** 
## age_group20                      0.096091   0.094853   1.013 0.311325    
## age_group30                      0.041532   0.085085   0.488 0.625588    
## age_group40                      0.014224   0.048373   0.294 0.768798    
## age_group50                     -0.025681   0.034744  -0.739 0.460022    
## age_group60                     -0.022031   0.024720  -0.891 0.373062    
## age_group80                      0.062061   0.022358   2.776 0.005630 ** 
## age_group90                      0.112468   0.030095   3.737 0.000199 ***
## death_typeunnatural              0.147611   0.106906   1.381 0.167721    
## genderFemale                     0.038463   0.025803   1.491 0.136437    
## anglonon_anglo                  -0.113644   0.026591  -4.274 2.14e-05 ***
## anglounknown                    -0.149934   0.030603  -4.899 1.15e-06 ***
## type_groupacademia/engineering  -0.019377   0.070374  -0.275 0.783122    
## type_groupgeneral fame          -0.008856   0.044371  -0.200 0.841847    
## type_groupknown for death       -0.064415   0.035556  -1.812 0.070396 .  
## type_groupleadership            -0.070072   0.029587  -2.368 0.018094 *  
## type_groupsports                -0.044702   0.029831  -1.498 0.134378    
## age_group20:death_typeunnatural -0.027778   0.154966  -0.179 0.857782    
## age_group30:death_typeunnatural -0.004357   0.151735  -0.029 0.977101    
## age_group40:death_typeunnatural  0.026451   0.134728   0.196 0.844400    
## age_group50:death_typeunnatural -0.098592   0.127251  -0.775 0.438686    
## age_group60:death_typeunnatural  0.059818   0.125787   0.476 0.634521    
## age_group80:death_typeunnatural -0.009561   0.193525  -0.049 0.960608    
## age_group90:death_typeunnatural -0.288201   0.297859  -0.968 0.333534    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2753 on 845 degrees of freedom
## Multiple R-squared:  0.1189, Adjusted R-squared:  0.09283 
## F-statistic: 4.561 on 25 and 845 DF,  p-value: 2.333e-12

## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71524 -0.18611 -0.00282  0.20651  0.61822 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.334205   0.031828  10.500  < 2e-16 ***
## age_group70                     -0.033041   0.021488  -1.538 0.124511    
## age_group20                      0.237288   0.089330   2.656 0.008049 ** 
## age_group30                      0.175763   0.080115   2.194 0.028516 *  
## age_group40                      0.035998   0.045584   0.790 0.429916    
## age_group50                      0.020332   0.032761   0.621 0.535032    
## age_group60                      0.005842   0.023277   0.251 0.801897    
## age_group80                     -0.026348   0.021223  -1.241 0.214778    
## age_group90                     -0.025258   0.028355  -0.891 0.373308    
## death_typeunnatural              0.108666   0.100663   1.079 0.280675    
## genderFemale                    -0.007697   0.024242  -0.318 0.750933    
## anglonon_anglo                  -0.033544   0.024986  -1.342 0.179797    
## anglounknown                    -0.109642   0.029019  -3.778 0.000169 ***
## type_groupacademia/engineering   0.118763   0.066238   1.793 0.073337 .  
## type_groupgeneral fame           0.047796   0.041796   1.144 0.253129    
## type_groupknown for death       -0.026487   0.033731  -0.785 0.432540    
## type_groupleadership             0.026762   0.027648   0.968 0.333343    
## type_groupsports                 0.023447   0.028200   0.831 0.405954    
## age_group20:death_typeunnatural -0.078726   0.145910  -0.540 0.589650    
## age_group30:death_typeunnatural  0.001729   0.142781   0.012 0.990343    
## age_group40:death_typeunnatural  0.082247   0.126535   0.650 0.515874    
## age_group50:death_typeunnatural -0.030965   0.119968  -0.258 0.796386    
## age_group60:death_typeunnatural -0.014870   0.118442  -0.126 0.900118    
## age_group80:death_typeunnatural -0.166931   0.182080  -0.917 0.359509    
## age_group90:death_typeunnatural -0.125260   0.280592  -0.446 0.655414    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2591 on 845 degrees of freedom
## Multiple R-squared:  0.2191, Adjusted R-squared:  0.196 
## F-statistic: 9.482 on 25 and 845 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.70715 -0.19327  0.02603  0.21907  0.59020 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank              0.116011   0.033760   3.436 0.000618 ***
## age_group70                      0.038062   0.022793   1.670 0.095301 .  
## age_group20                      0.071226   0.094752   0.752 0.452437    
## age_group30                      0.179281   0.084978   2.110 0.035174 *  
## age_group40                      0.097324   0.048351   2.013 0.044444 *  
## age_group50                     -0.009159   0.034750  -0.264 0.792173    
## age_group60                     -0.008024   0.024690  -0.325 0.745276    
## age_group80                      0.023841   0.022511   1.059 0.289860    
## age_group90                      0.007153   0.030076   0.238 0.812081    
## death_typeunnatural              0.122294   0.106773   1.145 0.252381    
## genderFemale                     0.029356   0.025713   1.142 0.253903    
## anglonon_anglo                  -0.034279   0.026503  -1.293 0.196225    
## anglounknown                    -0.141044   0.030780  -4.582 5.29e-06 ***
## type_groupacademia/engineering   0.076314   0.070259   1.086 0.277705    
## type_groupgeneral fame           0.003185   0.044332   0.072 0.942744    
## type_groupknown for death       -0.024310   0.035778  -0.679 0.497031    
## type_groupleadership            -0.101816   0.029326  -3.472 0.000543 ***
## type_groupsports                -0.023928   0.029912  -0.800 0.423974    
## age_group20:death_typeunnatural  0.090805   0.154766   0.587 0.557546    
## age_group30:death_typeunnatural -0.082225   0.151447  -0.543 0.587321    
## age_group40:death_typeunnatural -0.019321   0.134215  -0.144 0.885570    
## age_group50:death_typeunnatural -0.079483   0.127249  -0.625 0.532389    
## age_group60:death_typeunnatural -0.001916   0.125630  -0.015 0.987838    
## age_group80:death_typeunnatural -0.006991   0.193131  -0.036 0.971132    
## age_group90:death_typeunnatural -0.144877   0.297622  -0.487 0.626539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2749 on 845 degrees of freedom
## Multiple R-squared:  0.1214, Adjusted R-squared:  0.0954 
## F-statistic:  4.67 on 25 and 845 DF,  p-value: 8.867e-13

Models of news-vs.-Twitter boost difference for fixed person (without interactions)

for (ranks in c(FALSE, TRUE)) {
  mods <- run_regression_analysis_T_vs_N(ranks)
  
  print_regression_table_T_vs_N(mods, ranks)
  
  print_model_description('NEWS vs. TWITTER', 'short-term boost', ranks)
  print(summary(mods$mod_peak))
  
  print_model_description('NEWS vs. TWITTER', 'long-term boost', ranks)
  print(summary(mods$mod_perm))
}
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s", 
##     suffix, predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65313 -0.36479  0.00554  0.34318  2.14224 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.42705    0.04688  -9.109  < 2e-16 ***
## mean_before_relrank_diff       -0.21235    0.08346  -2.544  0.01112 *  
## age_group20                    -0.57747    0.12582  -4.590 5.11e-06 ***
## age_group30                    -0.23483    0.12379  -1.897  0.05816 .  
## age_group40                    -0.37445    0.09305  -4.024 6.22e-05 ***
## age_group50                    -0.20387    0.07324  -2.783  0.00550 ** 
## age_group60                    -0.17498    0.06057  -2.889  0.00396 ** 
## age_group80                     0.01435    0.05778   0.248  0.80388    
## age_group90                     0.16419    0.07233   2.270  0.02346 *  
## death_typeunnatural             0.34814    0.06995   4.977 7.82e-07 ***
## genderFemale                    0.09071    0.05323   1.704  0.08870 .  
## anglonon_anglo                 -0.21941    0.05437  -4.036 5.93e-05 ***
## anglounknown                   -0.05238    0.06307  -0.831  0.40646    
## type_groupacademia/engineering -0.04838    0.14619  -0.331  0.74076    
## type_groupgeneral fame         -0.01644    0.09169  -0.179  0.85776    
## type_groupknown for death       0.10452    0.07390   1.414  0.15760    
## type_groupleadership            0.19975    0.06208   3.217  0.00134 ** 
## type_groupsports                0.05875    0.06191   0.949  0.34292    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5712 on 852 degrees of freedom
## Multiple R-squared:  0.1013, Adjusted R-squared:  0.08336 
## F-statistic: 5.648 on 17 and 852 DF,  p-value: 2.893e-12
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25725 -0.05774  0.00741  0.06234  0.70460 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.014983   0.014259  -1.051 0.293665    
## mean_before_relrank_diff       -0.034104   0.025383  -1.344 0.179447    
## age_group20                    -0.134769   0.038268  -3.522 0.000452 ***
## age_group30                    -0.089151   0.037649  -2.368 0.018110 *  
## age_group40                    -0.101234   0.028300  -3.577 0.000367 ***
## age_group50                    -0.028959   0.022277  -1.300 0.193956    
## age_group60                    -0.020521   0.018421  -1.114 0.265602    
## age_group80                    -0.006351   0.017573  -0.361 0.717893    
## age_group90                     0.015400   0.021998   0.700 0.484072    
## death_typeunnatural            -0.008434   0.021275  -0.396 0.691877    
## genderFemale                    0.028585   0.016189   1.766 0.077796 .  
## anglonon_anglo                 -0.023095   0.016535  -1.397 0.162868    
## anglounknown                    0.012408   0.019182   0.647 0.517899    
## type_groupacademia/engineering -0.045961   0.044463  -1.034 0.301570    
## type_groupgeneral fame          0.002200   0.027887   0.079 0.937150    
## type_groupknown for death      -0.015268   0.022475  -0.679 0.497109    
## type_groupleadership           -0.006173   0.018882  -0.327 0.743791    
## type_groupsports                0.009053   0.018829   0.481 0.630790    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1737 on 852 degrees of freedom
## Multiple R-squared:  0.05249,    Adjusted R-squared:  0.03358 
## F-statistic: 2.776 on 17 and 852 DF,  p-value: 0.0001535
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s", 
##     suffix, predictors)), data = lmdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6496 -0.1208  0.0038  0.1106  0.7404 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.007131   0.016294   0.438 0.661761    
## mean_before_relrank_diff       -0.077831   0.029007  -2.683 0.007434 ** 
## age_group20                    -0.177328   0.043731  -4.055 5.47e-05 ***
## age_group30                    -0.091639   0.043024  -2.130 0.033461 *  
## age_group40                    -0.112473   0.032340  -3.478 0.000531 ***
## age_group50                    -0.058774   0.025457  -2.309 0.021195 *  
## age_group60                    -0.061181   0.021051  -2.906 0.003753 ** 
## age_group80                     0.012762   0.020081   0.636 0.525260    
## age_group90                     0.072718   0.025139   2.893 0.003917 ** 
## death_typeunnatural             0.132598   0.024312   5.454 6.46e-08 ***
## genderFemale                    0.030131   0.018500   1.629 0.103744    
## anglonon_anglo                 -0.077795   0.018896  -4.117 4.21e-05 ***
## anglounknown                   -0.017957   0.021921  -0.819 0.412906    
## type_groupacademia/engineering -0.026667   0.050811  -0.525 0.599843    
## type_groupgeneral fame         -0.011339   0.031869  -0.356 0.722071    
## type_groupknown for death       0.028830   0.025684   1.123 0.261965    
## type_groupleadership            0.060712   0.021578   2.814 0.005011 ** 
## type_groupsports                0.004572   0.021517   0.212 0.831794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1985 on 852 degrees of freedom
## Multiple R-squared:  0.1042, Adjusted R-squared:  0.08629 
## F-statistic: 5.827 on 17 and 852 DF,  p-value: 9.067e-13
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92186 -0.12959  0.01391  0.14260  0.95349 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.008043   0.022742   0.354  0.72367    
## mean_before_relrank_diff       -0.230628   0.040485  -5.697 1.68e-08 ***
## age_group20                    -0.105519   0.061035  -1.729  0.08420 .  
## age_group30                    -0.128012   0.060048  -2.132  0.03331 *  
## age_group40                    -0.106048   0.045137  -2.349  0.01903 *  
## age_group50                    -0.043230   0.035530  -1.217  0.22405    
## age_group60                    -0.026262   0.029381  -0.894  0.37165    
## age_group80                     0.004226   0.028027   0.151  0.88019    
## age_group90                     0.080720   0.035086   2.301  0.02165 *  
## death_typeunnatural             0.039642   0.033933   1.168  0.24303    
## genderFemale                    0.007763   0.025820   0.301  0.76373    
## anglonon_anglo                 -0.074869   0.026373  -2.839  0.00464 ** 
## anglounknown                    0.023978   0.030594   0.784  0.43341    
## type_groupacademia/engineering -0.069726   0.070916  -0.983  0.32578    
## type_groupgeneral fame          0.003712   0.044479   0.083  0.93352    
## type_groupknown for death       0.001229   0.035846   0.034  0.97266    
## type_groupleadership            0.066667   0.030116   2.214  0.02711 *  
## type_groupsports                0.008463   0.030031   0.282  0.77816    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2771 on 852 degrees of freedom
## Multiple R-squared:  0.0743, Adjusted R-squared:  0.05583 
## F-statistic: 4.023 on 17 and 852 DF,  p-value: 9.057e-08

Models of news-vs.-Twitter boost difference for fixed person (with “age by manner of death” interaction)

for (ranks in c(FALSE, TRUE)) {
  mods <- run_regression_analysis_T_vs_N_for_death_types(ranks)
  # Short-term.
  print_model_description('NEWS vs. TWITTER', 'short-term boost', ranks)
  print(summary(mods$mod_peak))
  # Long-term.
  print_model_description('NEWS vs. TWITTER', 'long-term boost', ranks)
  print(summary(mods$mod_perm))
  # Plots.
  par(mfrow=c(1,2))
  plot_age_coeffs_for_death_types(mods$mod_peak, NA, 'peak_mean_boost_diff', BASE_AGE, ranks,
                                  if (ranks) c(-0.45, 0.45) else c(-1.5, 0.5), 'black')
  plot_age_coeffs_for_death_types(mods$mod_perm, NA, 'perm_boost_diff', BASE_AGE, ranks,
                                  if (ranks) c(-0.45, 0.45) else c(-0.5, 0.5), 'black')
}
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s", 
##     suffix, predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65119 -0.36422  0.00888  0.34055  2.14507 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank_diff        -0.21682    0.08400  -2.581  0.01001 *  
## age_group70                     -0.42927    0.04758  -9.022  < 2e-16 ***
## age_group20                     -0.82078    0.19722  -4.162 3.48e-05 ***
## age_group30                     -0.80494    0.17718  -4.543 6.35e-06 ***
## age_group40                     -0.78266    0.10074  -7.769 2.28e-14 ***
## age_group50                     -0.63262    0.07226  -8.755  < 2e-16 ***
## age_group60                     -0.60094    0.05132 -11.711  < 2e-16 ***
## age_group80                     -0.41640    0.04659  -8.938  < 2e-16 ***
## age_group90                     -0.26085    0.06271  -4.160 3.51e-05 ***
## death_typeunnatural              0.40697    0.22256   1.829  0.06782 .  
## genderFemale                     0.09575    0.05356   1.788  0.07419 .  
## anglonon_anglo                  -0.21719    0.05509  -3.942 8.74e-05 ***
## anglounknown                    -0.05533    0.06348  -0.872  0.38367    
## type_groupacademia/engineering  -0.04801    0.14652  -0.328  0.74326    
## type_groupgeneral fame          -0.01437    0.09234  -0.156  0.87634    
## type_groupknown for death        0.10249    0.07432   1.379  0.16825    
## type_groupleadership             0.19868    0.06232   3.188  0.00148 ** 
## type_groupsports                 0.05601    0.06229   0.899  0.36880    
## age_group20:death_typeunnatural -0.32913    0.32231  -1.021  0.30746    
## age_group30:death_typeunnatural  0.17762    0.31605   0.562  0.57426    
## age_group40:death_typeunnatural -0.11676    0.28006  -0.417  0.67686    
## age_group50:death_typeunnatural -0.04974    0.26513  -0.188  0.85125    
## age_group60:death_typeunnatural -0.06810    0.26168  -0.260  0.79474    
## age_group80:death_typeunnatural  0.16940    0.40237   0.421  0.67385    
## age_group90:death_typeunnatural -0.37706    0.61904  -0.609  0.54262    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5724 on 845 degrees of freedom
## Multiple R-squared:  0.4346, Adjusted R-squared:  0.4178 
## F-statistic: 25.98 on 25 and 845 DF,  p-value: < 2.2e-16
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20604 -0.05399  0.00862  0.05958  0.65987 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank_diff        -0.0387605  0.0254315  -1.524  0.12785    
## age_group70                     -0.0127546  0.0144053  -0.885  0.37618    
## age_group20                     -0.0266540  0.0597136  -0.446  0.65545    
## age_group30                     -0.1643295  0.0536460  -3.063  0.00226 ** 
## age_group40                     -0.1367607  0.0305002  -4.484 8.34e-06 ***
## age_group50                     -0.0394232  0.0218780  -1.802  0.07191 .  
## age_group60                     -0.0364275  0.0155368  -2.345  0.01928 *  
## age_group80                     -0.0201342  0.0141055  -1.427  0.15383    
## age_group90                      0.0010249  0.0189868   0.054  0.95697    
## death_typeunnatural             -0.0496287  0.0673856  -0.736  0.46164    
## genderFemale                     0.0285184  0.0162164   1.759  0.07900 .  
## anglonon_anglo                  -0.0235607  0.0166806  -1.412  0.15818    
## anglounknown                     0.0105550  0.0192186   0.549  0.58301    
## type_groupacademia/engineering  -0.0463112  0.0443633  -1.044  0.29683    
## type_groupgeneral fame           0.0001484  0.0279576   0.005  0.99577    
## type_groupknown for death       -0.0162957  0.0225018  -0.724  0.46915    
## type_groupleadership            -0.0049896  0.0188673  -0.264  0.79149    
## type_groupsports                 0.0050388  0.0188582   0.267  0.78939    
## age_group20:death_typeunnatural -0.1369124  0.0975852  -1.403  0.16098    
## age_group30:death_typeunnatural  0.1441439  0.0956893   1.506  0.13234    
## age_group40:death_typeunnatural  0.1064914  0.0847944   1.256  0.20951    
## age_group50:death_typeunnatural  0.0275709  0.0802742   0.343  0.73134    
## age_group60:death_typeunnatural  0.0599112  0.0792287   0.756  0.44975    
## age_group80:death_typeunnatural  0.0063933  0.1218248   0.052  0.95816    
## age_group90:death_typeunnatural  0.0029407  0.1874274   0.016  0.98749    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1733 on 845 degrees of freedom
## Multiple R-squared:  0.101,  Adjusted R-squared:  0.07445 
## F-statistic: 3.799 on 25 and 845 DF,  p-value: 1.803e-09

## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s", 
##     suffix, predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64846 -0.11978  0.00327  0.10887  0.74189 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank_diff        -0.079660   0.029192  -2.729 0.006487 ** 
## age_group70                      0.005902   0.016535   0.357 0.721241    
## age_group20                     -0.131089   0.068543  -1.913 0.056148 .  
## age_group30                     -0.142074   0.061578  -2.307 0.021283 *  
## age_group40                     -0.100671   0.035010  -2.876 0.004135 ** 
## age_group50                     -0.052507   0.025113  -2.091 0.036838 *  
## age_group60                     -0.051830   0.017834  -2.906 0.003754 ** 
## age_group80                      0.018036   0.016191   1.114 0.265611    
## age_group90                      0.080562   0.021794   3.696 0.000233 ***
## death_typeunnatural              0.155420   0.077349   2.009 0.044819 *  
## genderFemale                     0.032534   0.018614   1.748 0.080860 .  
## anglonon_anglo                  -0.076134   0.019147  -3.976  7.6e-05 ***
## anglounknown                    -0.018445   0.022060  -0.836 0.403316    
## type_groupacademia/engineering  -0.026851   0.050923  -0.527 0.598128    
## type_groupgeneral fame          -0.010738   0.032091  -0.335 0.738007    
## type_groupknown for death        0.027646   0.025829   1.070 0.284766    
## type_groupleadership             0.060264   0.021657   2.783 0.005512 ** 
## type_groupsports                 0.004710   0.021646   0.218 0.827786    
## age_group20:death_typeunnatural -0.081471   0.112014  -0.727 0.467224    
## age_group30:death_typeunnatural  0.071281   0.109837   0.649 0.516537    
## age_group40:death_typeunnatural -0.038913   0.097332  -0.400 0.689404    
## age_group50:death_typeunnatural -0.020927   0.092143  -0.227 0.820392    
## age_group60:death_typeunnatural -0.046500   0.090943  -0.511 0.609270    
## age_group80:death_typeunnatural  0.071107   0.139837   0.508 0.611238    
## age_group90:death_typeunnatural -0.166528   0.215139  -0.774 0.439120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1989 on 845 degrees of freedom
## Multiple R-squared:  0.1079, Adjusted R-squared:  0.08154 
## F-statistic: 4.089 on 25 and 845 DF,  p-value: 1.466e-10
## 
## 
## 
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
## 
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix, 
##     predictors)), data = lmdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92420 -0.13055  0.01538  0.14045  0.95719 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## mean_before_relrank_diff        -0.2341465  0.0407524  -5.746 1.28e-08 ***
## age_group70                      0.0106514  0.0230835   0.461  0.64461    
## age_group20                      0.0031530  0.0956873   0.033  0.97372    
## age_group30                     -0.1718587  0.0859643  -1.999  0.04591 *  
## age_group40                     -0.1054249  0.0488746  -2.157  0.03128 *  
## age_group50                     -0.0333427  0.0350581  -0.951  0.34184    
## age_group60                     -0.0213921  0.0248968  -0.859  0.39046    
## age_group80                      0.0133539  0.0226032   0.591  0.55481    
## age_group90                      0.0899990  0.0304251   2.958  0.00318 ** 
## death_typeunnatural             -0.0135298  0.1079812  -0.125  0.90032    
## genderFemale                     0.0080192  0.0259857   0.309  0.75770    
## anglonon_anglo                  -0.0751648  0.0267296  -2.812  0.00504 ** 
## anglounknown                     0.0227608  0.0307967   0.739  0.46007    
## type_groupacademia/engineering  -0.0697905  0.0710894  -0.982  0.32651    
## type_groupgeneral fame           0.0031289  0.0448002   0.070  0.94434    
## type_groupknown for death       -0.0002293  0.0360578  -0.006  0.99493    
## type_groupleadership             0.0668489  0.0302337   2.211  0.02730 *  
## type_groupsports                 0.0050713  0.0302191   0.168  0.86677    
## age_group20:death_typeunnatural -0.0923192  0.1563740  -0.590  0.55510    
## age_group30:death_typeunnatural  0.1416421  0.1533360   0.924  0.35589    
## age_group40:death_typeunnatural  0.0782650  0.1358777   0.576  0.56477    
## age_group50:death_typeunnatural  0.0504094  0.1286343   0.392  0.69524    
## age_group60:death_typeunnatural  0.0908410  0.1269589   0.716  0.47449    
## age_group80:death_typeunnatural  0.0187045  0.1952165   0.096  0.92369    
## age_group90:death_typeunnatural -0.0496572  0.3003405  -0.165  0.86872    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2777 on 845 degrees of freedom
## Multiple R-squared:  0.07775,    Adjusted R-squared:  0.05046 
## F-statistic: 2.849 on 25 and 845 DF,  p-value: 4.86e-06

Document length

regdata <- load_doc_length_regression_data()
# Only constant, i.e., estimate mean (plus standard errors).
plot_doc_length_increase(regdata, '1', 'Direct estimate')
# Adjust for those predictors that were found to have a significant impact on short-term
# post-mortem mention frequency in the news.
# (All possible predictors: mean_before_relrank, age_group, death_type, gender, anglo, type_group.)
plot_doc_length_increase(regdata, 'mean_before_relrank + age_group + death_type + anglo', 'Adjusted estimate')

Figure 12: Relative length increase of documents that mention deceased public figures, with respect to the respective public figure’s pre-mortem mean document length, as a function of days since death. All means in this analysis are geometric means. Error bars are 95% confidence intervals approximated as \(\pm 2\) standard errors. Left: Direct mean estimates of relative length increase. Right: Estimates adjusted for population drift (since certain groups of people are more likely to be mentioned post-mortem than others). Each day \(t\)’s estimate was obtained from a separate linear regression model for day \(t\) only, which included each person \(i\) mentioned that day as a data point, with \(\log(L_{it}/P_i)\) as the dependent variable (where \(L_{it}\) is \(i\)’s mean document length on day \(t\), and \(P_i\) is \(i\)’s pre-mortem mean document length), and with independent variables defined by the predictors that were previously found to be significantly associated with short-term post-mortem mention frequency (language, pre-mortem mean mention frequency rank, manner of death, age group), such that estimates are for anglophones of median pre-mortem popularity who died an unnatural death at age 70-79 years. The adjusted estimate of the (geometric) mean relative length increase for day \(t\) is given by \(e^{a_t}-1\), where \(a_t\) is the intercept of the regression for day \(t\). We see that, both with and without adjustment, the documents that mentioned a person on the day of their death were on average about 40% shorter than documents that mentioned the person before their death, and that the pre-mortem level is reached again after about one month.